1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Overlap comms and compute options in wilson kernels

This commit is contained in:
paboyle 2017-02-07 01:37:10 -05:00
parent 71ac2e7940
commit 2c246551d0
11 changed files with 729 additions and 366 deletions

View File

@ -180,26 +180,31 @@ namespace QCD {
const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
{ }; { };
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
HaloExchangeOptGather(source,compress);
this->CommunicateBegin(reqs);
this->calls++;
this->CommunicateComplete(reqs);
this->CommsMerge();
}
template < class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{
this->calls++;
this->Mergers.resize(0); this->Mergers.resize(0);
this->Packets.resize(0); this->Packets.resize(0);
this->HaloGatherOpt(source,compress); this->HaloGatherOpt(source,compress);
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
this->CommsMerge(); // spins
this->calls++;
} }
template < class compressor> template < class compressor>
void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress) void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
{ {
int face_idx=0; this->_grid->StencilBarrier();
// conformable(source._grid,_grid); // conformable(source._grid,_grid);
assert(source._grid==this->_grid); assert(source._grid==this->_grid);
this->halogtime-=usecond(); this->halogtime-=usecond();
@ -222,7 +227,9 @@ namespace QCD {
// compress.Point(point); // compress.Point(point);
// HaloGatherDir(source,compress,point,face_idx); // HaloGatherDir(source,compress,point,face_idx);
// } // }
int face_idx=0;
if ( dag ) { if ( dag ) {
std::cout << " Optimised Dagger compress " <<std::endl;
this->HaloGatherDir(source,XpCompress,Xp,face_idx); this->HaloGatherDir(source,XpCompress,Xp,face_idx);
this->HaloGatherDir(source,YpCompress,Yp,face_idx); this->HaloGatherDir(source,YpCompress,Yp,face_idx);
this->HaloGatherDir(source,ZpCompress,Zp,face_idx); this->HaloGatherDir(source,ZpCompress,Zp,face_idx);

View File

@ -224,7 +224,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
//////////////////////// ////////////////////////
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) { for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma); gamma);
} }
@ -335,8 +335,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
dirdisp, gamma);
} }
}; };
@ -353,14 +352,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
if (dag == DaggerYes) { if (dag == DaggerYes) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
out);
} }
} }
}; };

View File

@ -182,34 +182,34 @@ void WilsonFermion5D<Impl>::Report(void)
std::vector<int> latt = GridDefaultLatt(); std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors; RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
if ( DhopCalls > 0 ) { if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
} }
if ( DerivCalls > 0 ) { if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
@ -232,6 +232,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
DhopCalls = 0; DhopCalls = 0;
DhopCommTime = 0; DhopCommTime = 0;
DhopComputeTime = 0; DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; DerivCalls = 0;
DerivCommTime = 0; DerivCommTime = 0;
@ -277,7 +280,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma); Kernels::DhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
} }
} }
}; };
@ -329,7 +332,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
assert(sF < B._grid->oSites()); assert(sF < B._grid->oSites());
assert(sU < U._grid->oSites()); assert(sU < U._grid->oSites());
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); Kernels::DhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
//////////////////////////// ////////////////////////////
// spin trace outer product // spin trace outer product
@ -396,6 +399,86 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
Compressor compressor(dag);
int LLs = in._grid->_rdimensions[0];
int len = U._grid->oSites();
DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
std::vector<std::vector<CommsRequest_t> > reqs;
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
int me = omp_get_thread_num();
int myoff, mywork;
GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
int sF = LLs * myoff;
if ( me == 0 ) {
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
st.CommunicateComplete(reqs);
DhopCommTime+=usecond();
} else {
// Interior links in stencil
if ( me==1 ) DhopComputeTime-=usecond();
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
if ( me==1 ) DhopComputeTime+=usecond();
}
}
DhopFaceTime-=usecond();
st.CommsMerge();
DhopFaceTime+=usecond();
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
int me = omp_get_thread_num();
int myoff, mywork;
GridThread::GetWork(len,me,mywork,myoff,nthreads);
int sF = LLs * myoff;
// Exterior links in stencil
if ( me==0 ) DhopComputeTime2-=usecond();
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
if ( me==0 ) DhopComputeTime2+=usecond();
}// end parallel region
#else
assert(0);
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{ {
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -408,12 +491,30 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DhopComputeTime-=usecond(); DhopComputeTime-=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) { if (dag == DaggerYes) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss; int sU = ss;
int sF = LLs * sU; int sF = LLs * sU;
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
}
/*
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
} }
#ifdef AVX512_SWITCHOFF #ifdef AVX512_SWITCHOFF
} else if (stat.is_init() ) { } else if (stat.is_init() ) {
@ -430,31 +531,35 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
for(int ss=0;ss<U._grid->oSites();ss++) { for(int ss=0;ss<U._grid->oSites();ss++) {
int sU=ss; int sU=ss;
int sF=LLs*sU; int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
} }
stat.exit(mythread); stat.exit(mythread);
} }
stat.accum(nthreads); stat.accum(nthreads);
#endif #endif
} else { } else {
#if 0 #if 1
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss; int sU = ss;
int sF = LLs * sU; int sF = LLs * sU;
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
} }
#else #else
#ifdef GRID_OMP
#pragma omp parallel #pragma omp parallel
#endif
{ {
int len = U._grid->oSites(); int len = U._grid->oSites();
int me, myoff,mywork; int me, myoff,mywork;
GridThread::GetWorkBarrier(len,me, mywork,myoff); GridThread::GetWorkBarrier(len,me, mywork,myoff);
int sF = LLs * myoff; int sF = LLs * myoff;
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out); Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
} }
#endif #endif
} }
*/
DhopComputeTime+=usecond(); DhopComputeTime+=usecond();
} }

View File

@ -82,6 +82,9 @@ namespace QCD {
double DhopCalls; double DhopCalls;
double DhopCommTime; double DhopCommTime;
double DhopComputeTime; double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls; double DerivCalls;
double DerivCommTime; double DerivCommTime;
@ -146,6 +149,20 @@ namespace QCD {
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors // Constructors
WilsonFermion5D(GaugeField &_Umu, WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -32,8 +32,8 @@ directory
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
int WilsonKernelsStatic::Opt; int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric;
int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
#ifdef QPX #ifdef QPX
#include <spi/include/kernel/location.h> #include <spi/include/kernel/location.h>
@ -87,9 +87,10 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
//////////////////////////////////////////// ////////////////////////////////////////////
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out) { int sU, const FermionField &in, FermionField &out,
int interior,int exterior) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -263,9 +264,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOr
// Need controls to do interior, exterior, or both // Need controls to do interior, exterior, or both
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out) { int sU, const FermionField &in, FermionField &out,int interior,int exterior) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;
SiteHalfSpinor chi; SiteHalfSpinor chi;
SiteHalfSpinor *chi_p; SiteHalfSpinor *chi_p;
@ -438,7 +439,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder
}; };
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int gamma) { int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
SiteHalfSpinor tmp; SiteHalfSpinor tmp;

View File

@ -43,8 +43,10 @@ void bgq_l1p_optimisation(int mode);
class WilsonKernelsStatic { class WilsonKernelsStatic {
public: public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { OptGeneric, OptHandUnroll, OptInlineAsm };
enum { CommsAndCompute, CommsThenCompute };
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
static int Opt; // these are a temporary hack static int Opt; // these are a temporary hack
static int Comms; // these are a temporary hack
}; };
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
@ -57,20 +59,23 @@ public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{ {
bgq_l1p_optimisation(1); bgq_l1p_optimisation(1);
switch(Opt) { switch(Opt) {
#if defined(AVX512) || defined (QPX) #if defined(AVX512) || defined (QPX)
case OptInlineAsm: case OptInlineAsm:
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (interior) WilsonKernels<Impl>::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (exterior) WilsonKernels<Impl>::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else assert(0);
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); if( exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
@ -79,7 +84,7 @@ public:
case OptGeneric: case OptGeneric:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); if( exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
@ -93,12 +98,12 @@ public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
// no kernel choice // no kernel choice
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); if( exterior) WilsonKernels<Impl>::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
@ -107,20 +112,23 @@ public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
bgq_l1p_optimisation(1); bgq_l1p_optimisation(1);
switch(Opt) { switch(Opt) {
#if defined(AVX512) || defined (QPX) #if defined(AVX512) || defined (QPX)
case OptInlineAsm: case OptInlineAsm:
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (interior) WilsonKernels<Impl>::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (exterior) WilsonKernels<Impl>::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else assert(0);
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); if( exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
@ -129,7 +137,7 @@ public:
case OptGeneric: case OptGeneric:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); if( exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
@ -143,40 +151,53 @@ public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); if( exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior);
sF++; sF++;
} }
sU++; sU++;
} }
} }
void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private: private:
// Specialised variants // Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out); int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out); int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out); int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out); int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior);
public: public:

View File

@ -36,18 +36,47 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// Default to no assembler implementation // Default to no assembler implementation
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
template<class Impl> void template<class Impl> void
WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<Impl >::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
template<class Impl> void template<class Impl> void
WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<Impl >::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
@ -57,11 +86,22 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
#include <qcd/action/fermion/WilsonKernelsAsmQPX.h> #include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
#define INSTANTIATE_ASM(A)\ #define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\ \
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\
template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\
template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(WilsonImplD);

View File

@ -53,12 +53,36 @@ static Vector<vComplexF> signsF;
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0]; #define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single // XYZT vectorised, undag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -66,8 +90,27 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &
// XYZT vectorised, dag Kernel, single // XYZT vectorised, dag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -80,8 +123,29 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
// Ls vectorised, undag Kernel, single // Ls vectorised, undag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#undef MULT_2SPIN
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -89,10 +153,30 @@ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu
// Ls vectorised, dag Kernel, single // Ls vectorised, dag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_SIGNS #undef COMPLEX_SIGNS
#undef MAYBEPERM #undef MAYBEPERM
#undef MULT_2SPIN #undef MULT_2SPIN
@ -110,50 +194,129 @@ static int signInitD = setupSigns(signsD);
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; #define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT Vectorised, undag Kernel, double // XYZT vectorised, undag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT Vectorised, dag Kernel, double // XYZT vectorised, dag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
#undef MAYBEPERM #undef MAYBEPERM
#undef MULT_2SPIN #undef MULT_2SPIN
#define MAYBEPERM(A,B) #define MAYBEPERM(A,B)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
/////////////////////////////////////////////////////////////////
// Ls vectorised, undag Kernel, double
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, double // Ls vectorised, undag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#undef MULT_2SPIN
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_SIGNS #undef COMPLEX_SIGNS
#undef MAYBEPERM #undef MAYBEPERM

View File

@ -1,259 +1,267 @@
#ifdef KERNEL_DAG
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
#define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM
#define DIR3_RECON TP_RECON_ACCUM
#define DIR4_RECON XM_RECON_ACCUM
#define DIR5_RECON YM_RECON_ACCUM
#define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM
#else
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
#define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM
#define DIR3_RECON TM_RECON_ACCUM
#define DIR4_RECON XP_RECON_ACCUM
#define DIR5_RECON YP_RECON_ACCUM
#define DIR6_RECON ZP_RECON_ACCUM
#define DIR7_RECON TP_RECON_ACCUM
#endif
////////////////////////////////////////////////////////////////////////////////
// Comms then compute kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR_AND_EXTERIOR
#define ZERO_NMU(A)
#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON)
#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON)
#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \
LOAD64(%r10,isigns); \
PROJMEM(base); \
MAYBEPERM(PERMUTE_DIR,perm);
#define EXTERIOR_BLOCK(a,b,RECON) \
LOAD_CHI(base);
#define COMMON_BLOCK(a,b,RECON) \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
MULT_2SPIN_DIR_PF(a,basep); \
LOAD64(%r10,isigns); \
RECON;
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Pre comms kernel -- prefetch like normal because it is mostly right
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR
#define COMMON_BLOCK(a,b,RECON)
#define ZERO_NMU(A)
// No accumulate for DIR0
#define EXTERIOR_BLOCK_XP(a,b,RECON) \
ZERO_PSI; \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++;
#define EXTERIOR_BLOCK(a,b,RECON) \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++;
#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON)
#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \
LOAD64(%r10,isigns); \
PROJMEM(base); \
MAYBEPERM(PERMUTE_DIR,perm); \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
MULT_2SPIN_DIR_PF(a,basep); \
LOAD64(%r10,isigns); \
RECON;
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ZERO_NMU(A) nmu=0;
#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \
ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++;
#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON)
#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++;
#define EXTERIOR_BLOCK(a,b,RECON) \
nmu++; \
LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(a,base); \
base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \
LOAD64(%r10,isigns); \
RECON;
#define COMMON_BLOCK(a,b,RECON)
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
{ {
int nmu;
int local,perm, ptype; int local,perm, ptype;
uint64_t base; uint64_t base;
uint64_t basep; uint64_t basep;
const uint64_t plocal =(uint64_t) & in._odata[0]; const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
//COMPLEX_TYPE is vComplexF of vComplexD depending
//on the chosen precision
COMPLEX_SIGNS(isigns); COMPLEX_SIGNS(isigns);
MASK_REGS; MASK_REGS;
int nmax=U._grid->oSites(); int nmax=U._grid->oSites();
for(int site=0;site<Ns;site++) { for(int site=0;site<Ns;site++) {
int sU =lo.Reorder(ssU); int sU =lo.Reorder(ssU);
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
LOCK_GAUGE(0);
int ssn=ssU+1;
if(ssn>=nmax) ssn=0;
int sUn=lo.Reorder(ssn); int sUn=lo.Reorder(ssn);
#ifndef EXTERIOR
LOCK_GAUGE(0);
#endif
for(int s=0;s<Ls;s++) { for(int s=0;s<Ls;s++) {
ss =sU*Ls+s; ss =sU*Ls+s;
ssn=sUn*Ls+s; ssn=sUn*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim int ent=ss*8;// 2*Ndim
int nent=ssn*8; int nent=ssn*8;
PF_GAUGE(Xp); ZERO_NMU(0);
base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++; base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
#ifndef EXTERIOR
PF_GAUGE(Xp);
PREFETCH1_CHIMU(base); PREFETCH1_CHIMU(base);
#endif
////////////////////////////////
// Xp
////////////////////////////////
basep = st.GetPFInfo(nent,plocal); nent++; basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); INTERIOR_BLOCK_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
#ifdef KERNEL_DAG
XP_PROJMEM(base);
#else
XM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK_XP(Xp,Yp,DIR0_RECON);
} }
base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++; COMMON_BLOCK(Xp,Yp,DIR0_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXP(Xp,basep);
}
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_RECON;
#else
XM_RECON;
#endif
//////////////////////////////// ////////////////////////////////
// Yp // Yp
//////////////////////////////// ////////////////////////////////
basep = st.GetPFInfo(nent,plocal); nent++; basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
#ifdef KERNEL_DAG
YP_PROJMEM(base);
#else
YM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Yp,Zp,DIR1_RECON);
} }
base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; COMMON_BLOCK(Yp,Zp,DIR1_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYP(Yp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_RECON_ACCUM;
#else
YM_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Zp // Zp
//////////////////////////////// ////////////////////////////////
basep = st.GetPFInfo(nent,plocal); nent++; basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
#ifdef KERNEL_DAG
ZP_PROJMEM(base);
#else
ZM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Zp,Tp,DIR2_RECON);
} }
base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; COMMON_BLOCK(Zp,Tp,DIR2_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZP(Zp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_RECON_ACCUM;
#else
ZM_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Tp // Tp
//////////////////////////////// ////////////////////////////////
basep = st.GetPFInfo(nent,plocal); nent++; basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
#ifdef KERNEL_DAG
TP_PROJMEM(base);
#else
TM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Tp,Xm,DIR3_RECON);
} }
base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; COMMON_BLOCK(Tp,Xm,DIR3_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFTP(Tp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_RECON_ACCUM;
#else
TM_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Xm // Xm
//////////////////////////////// ////////////////////////////////
#ifndef STREAM_STORE
basep= (uint64_t) &out._odata[ss];
#endif
// basep= st.GetPFInfo(nent,plocal); nent++; // basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
#ifdef KERNEL_DAG
XM_PROJMEM(base);
#else
XP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Xm,Ym,DIR4_RECON);
} }
base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; COMMON_BLOCK(Xm,Ym,DIR4_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXM(Xm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_RECON_ACCUM;
#else
XP_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Ym // Ym
//////////////////////////////// ////////////////////////////////
basep= st.GetPFInfo(nent,plocal); nent++; basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
#ifdef KERNEL_DAG
YM_PROJMEM(base);
#else
YP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Ym,Zm,DIR5_RECON);
} }
base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; COMMON_BLOCK(Ym,Zm,DIR5_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYM(Ym,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_RECON_ACCUM;
#else
YP_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Zm // Zm
//////////////////////////////// ////////////////////////////////
basep= st.GetPFInfo(nent,plocal); nent++; basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
#ifdef KERNEL_DAG
ZM_PROJMEM(base);
#else
ZP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Zm,Tm,DIR6_RECON);
} }
base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; COMMON_BLOCK(Zm,Tm,DIR6_RECON);
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZM(Zm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_RECON_ACCUM;
#else
ZP_RECON_ACCUM;
#endif
//////////////////////////////// ////////////////////////////////
// Tm // Tm
//////////////////////////////// ////////////////////////////////
basep= st.GetPFInfo(nent,plocal); nent++; basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) { if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit INTERIOR_BLOCK(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
#ifdef KERNEL_DAG
TM_PROJMEM(base);
#else
TP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else { } else {
LOAD_CHI(base); EXTERIOR_BLOCK(Tm,Xp,DIR7_RECON);
} }
base= (uint64_t) &out._odata[ss]; COMMON_BLOCK(Tm,Xp,DIR7_RECON);
#ifndef STREAM_STORE
PREFETCH_CHIMU(base);
#endif
{
MULT_2SPIN_DIR_PFTM(Tm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_RECON_ACCUM;
#else
TP_RECON_ACCUM;
#endif
base = (uint64_t) &out._odata[ss];
basep= st.GetPFInfo(nent,plocal); nent++; basep= st.GetPFInfo(nent,plocal); nent++;
SAVE_RESULT(base,basep); RESULT(base,basep);
} }
ssU++; ssU++;
UNLOCK_GAUGE(0); UNLOCK_GAUGE(0);
} }
} }
#undef DIR0_PROJMEM
#undef DIR1_PROJMEM
#undef DIR2_PROJMEM
#undef DIR3_PROJMEM
#undef DIR4_PROJMEM
#undef DIR5_PROJMEM
#undef DIR6_PROJMEM
#undef DIR7_PROJMEM
#undef DIR0_RECON
#undef DIR1_RECON
#undef DIR2_RECON
#undef DIR3_RECON
#undef DIR4_RECON
#undef DIR5_RECON
#undef DIR6_RECON
#undef DIR7_RECON
#undef EXTERIOR_BLOCK
#undef INTERIOR_BLOCK
#undef EXTERIOR_BLOCK_XP
#undef INTERIOR_BLOCK_XP
#undef COMMON_BLOCK
#undef ZERO_NMU
#undef RESULT

View File

@ -43,12 +43,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
#define COMPLEX_SIGNS(isigns) #define COMPLEX_SIGNS(isigns)
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single // XYZT vectorised, undag Kernel, single
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
template<> void template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -57,7 +61,7 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
template<> void template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -71,7 +75,7 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
template<> void template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -80,7 +84,7 @@ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
template<> void template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef MAYBEPERM #undef MAYBEPERM
@ -100,7 +104,7 @@ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
template<> void template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
@ -111,7 +115,7 @@ WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
template<> void template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
@ -125,7 +129,7 @@ WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#undef KERNEL_DAG #undef KERNEL_DAG
template<> void template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
@ -135,7 +139,7 @@ WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
#define KERNEL_DAG #define KERNEL_DAG
template<> void template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////

View File

@ -312,8 +312,8 @@ namespace QCD {
template<class Impl> void template<class Impl> void
WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -554,8 +554,8 @@ WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,Doub
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior)
{ {
// std::cout << "Hand op Dhop "<<std::endl; // std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -800,31 +800,31 @@ void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder
// Specialise Gparity to simple implementation // Specialise Gparity to simple implementation
//////////////////////////////////////////////// ////////////////////////////////////////////////
template<> void template<> void
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplF>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
SiteHalfSpinor *buf, SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
{ {
assert(0); assert(0);
} }
template<> void template<> void
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, WilsonKernels<GparityWilsonImplF>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
SiteHalfSpinor *buf, SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
{ {
assert(0); assert(0);
} }
template<> void template<> void
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<GparityWilsonImplD>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
{ {
assert(0); assert(0);
} }
template<> void template<> void
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, WilsonKernels<GparityWilsonImplD>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out,int internal,int external)
{ {
assert(0); assert(0);
} }
@ -835,10 +835,10 @@ WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,Lebes
// Need Nc=3 though // // Need Nc=3 though //
#define INSTANTIATE_THEM(A) \ #define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \ int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior);
INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD); INSTANTIATE_THEM(WilsonImplD);