1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 20:57:06 +01:00

GparityWilsonTM typedef added. Not yet tested

Conflicts:
	configure
	lib/qcd/action/fermion/WilsonKernels.h
This commit is contained in:
Jung
2016-01-25 01:36:28 -05:00
15 changed files with 5731 additions and 3557 deletions

View File

@ -206,6 +206,9 @@ typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonP
// Fermion operators instantiated on the Gparity Wilson implementation policies.
// One alias per operator template (Wilson, WilsonTM, DomainWall) and per
// implementation suffix (R / F / D).
using GparityWilsonFermionR = WilsonFermion<GparityWilsonImplR>;
using GparityWilsonFermionF = WilsonFermion<GparityWilsonImplF>;
using GparityWilsonFermionD = WilsonFermion<GparityWilsonImplD>;
using GparityWilsonTMFermionR = WilsonTMFermion<GparityWilsonImplR>;
using GparityWilsonTMFermionF = WilsonTMFermion<GparityWilsonImplF>;
using GparityWilsonTMFermionD = WilsonTMFermion<GparityWilsonImplD>;
using GparityDomainWallFermionR = DomainWallFermion<GparityWilsonImplR>;
using GparityDomainWallFermionF = DomainWallFermion<GparityWilsonImplF>;
using GparityDomainWallFermionD = DomainWallFermion<GparityWilsonImplD>;

View File

@ -58,7 +58,6 @@ namespace QCD {
UmuOdd (&Hgrid)
{
// Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
}
@ -153,7 +152,7 @@ namespace QCD {
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange(B,comm_buf,compressor);
st.HaloExchange(B,compressor);
for(int mu=0;mu<Nd;mu++){
@ -168,7 +167,7 @@ namespace QCD {
////////////////////////
PARALLEL_FOR_LOOP
for(int sss=0;sss<B._grid->oSites();sss++){
Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
}
//////////////////////////////////////////////////
@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
Stencil.HaloExchange(in,comm_buf,compressor);
Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
}
};
@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,comm_buf,compressor);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
}
}
@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
comms_thread.join();
auto handle = st.HaloExchangeBegin(in,compressor);
bool local = true;
bool nonlocal = false;
@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
st.HaloExchangeComplete(handle);
local = false;
nonlocal = true;
@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}

View File

@ -152,9 +152,6 @@ namespace Grid {
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};

View File

@ -98,12 +98,12 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
}
// Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
alltime=0;
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -121,7 +121,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,comm_buf,compressor);
Stencil.HaloExchange(in,compressor);
int skip = (disp==1) ? 0 : 1;
@ -136,7 +136,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
}
}
};
@ -159,7 +159,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange(B,comm_buf,compressor);
st.HaloExchange(B,compressor);
Atilde=A;
@ -184,7 +184,7 @@ PARALLEL_FOR_LOOP
assert ( sF< B._grid->oSites());
assert ( sU< U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
////////////////////////////
// spin trace outer product
@ -235,22 +235,25 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
@ -288,7 +291,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
@ -299,11 +302,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int nwork = U._grid->oSites();
commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
auto handle = st.HaloExchangeBegin(in,compressor);
st.HaloExchangeComplete(handle);
commtime +=usecond();
jointime -=usecond();
thr.join();
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -319,7 +322,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
@ -330,7 +333,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -362,7 +365,7 @@ PARALLEL_FOR_LOOP
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
}
}
}
@ -387,7 +390,7 @@ PARALLEL_FOR_LOOP
sU=ss+ ssoff;
for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -398,7 +401,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
@ -407,12 +410,13 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
}
dslashtime +=usecond();
alltime+=usecond();
}
template<class Impl>
@ -421,7 +425,10 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
int calls;
int updates;
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
@ -432,7 +439,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
int nwork = U._grid->oSites();
commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -450,7 +457,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -461,7 +468,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -473,7 +480,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -482,7 +489,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -490,12 +497,12 @@ PARALLEL_FOR_LOOP
dslashtime +=usecond();
jointime -=usecond();
thr.join();
st.HaloExchangeComplete(handle);
jointime +=usecond();
local = false;
nonlocal = true;
dslashtime -=usecond();
dslash1time -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
@ -503,7 +510,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -514,7 +521,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -526,7 +533,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -535,13 +542,13 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslashtime +=usecond();
dslash1time +=usecond();
alltime+=usecond();
}

View File

@ -61,9 +61,11 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
double alltime;
double jointime;
double commtime;
double dslashtime;
double dslash1time;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -60,15 +60,10 @@ namespace Grid {
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
#if 0
<<<<<<< HEAD
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
// int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
#if 0
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
// doesn't seem to work with Gparity at the moment
#undef HANDOPT
#if 1
@ -79,7 +74,18 @@ namespace Grid {
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
// int sF,int sU,const FermionField &in, FermionField &out);
#endif
#else
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
//>>>>>>> fc6ad657514c7966291c19f22af89de5d5a96f93
#endif
WilsonKernels(const ImplParams &p= ImplParams());

View File

@ -310,7 +310,7 @@ namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
@ -318,21 +318,21 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01;
REGISTER Simd result_02;
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10;
REGISTER Simd result_11;
REGISTER Simd result_12;
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20;
REGISTER Simd result_21;
REGISTER Simd result_22;
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30;
REGISTER Simd result_31;
REGISTER Simd result_32; // 20 left
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
@ -372,172 +372,178 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
StencilEntry *SE;
int offset,local,perm, ptype;
int offset, ptype;
int num = 0;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XP_RECON_ACCUM;
num++;
}
XP_RECON;
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YP_RECON_ACCUM;
num++;
}
YP_RECON_ACCUM;
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZP_RECON_ACCUM;
num++;
}
ZP_RECON_ACCUM;
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TP_RECON_ACCUM;
num++;
}
TP_RECON_ACCUM;
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XM_RECON_ACCUM;
num++;
}
XM_RECON_ACCUM;
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YM_RECON_ACCUM;
num++;
}
YM_RECON_ACCUM;
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZM_RECON_ACCUM;
num++;
}
ZM_RECON_ACCUM;
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( perm) {
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
{
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TM_RECON_ACCUM;
num++;
}
TM_RECON_ACCUM;
{
SiteSpinor & ref (out._odata[ss]);
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
@ -550,9 +556,295 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
vstream(ref()(3)(0),result_30*(-0.5));
vstream(ref()(3)(1),result_31*(-0.5));
vstream(ref()(3)(2),result_32*(-0.5));
return 1;
} else if ( num ) {
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
return 1;
}
return 0;
}
// Hand-unrolled Dhop kernel for a single site.
//
// Visits the eight stencil directions (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm) in turn and
// accumulates each selected direction's contribution into result_00..result_32,
// then writes the accumulated value scaled by -0.5 to out._odata[ss].
//
// Parameters:
//   st       - stencil; GetEntry(ptype,dir,ss) supplies the entry for each direction.
//   U        - doubled gauge field; not named directly in this body, presumably
//              consumed by MULT_2SPIN (macro defined elsewhere -- confirm).
//   buf      - half-spinor buffer; not named directly in this body, presumably
//              read by LOAD_CHI for non-local neighbours (confirm against macro).
//   ss       - site index used for the stencil lookup and for the output site.
//   sU       - not named directly in this body, presumably the gauge-field site
//              index used inside MULT_2SPIN (confirm against macro).
//   in       - input field; not named directly here, presumably read by LOAD_CHIMU.
//   out      - output field; out._odata[ss] is overwritten or accumulated into.
//   Local    - include directions whose stencil entry has _is_local set.
//   Nonlocal - include directions whose stencil entry has _is_local clear.
//
// Returns:
//   1 if out._odata[ss] was written (always the case when Local is true, or when
//   at least one direction contributed), 0 if the site was left untouched.
//
// NOTE(review): the macros used below reference the local variable names
// (SE, offset, ptype, Chi_*, UChi_*, U_*, result_*) implicitly, so these names
// and the statement order must not be changed without checking the macros.
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
// Accumulators for the 4x3 result components. They go through zeroit() up front
// because a direction may be skipped entirely depending on Local/Nonlocal.
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
// The twelve Chimu_* names are aliases: components 0,1 map onto the Chi_*
// variables and components 2,3 map onto the UChi_* variables, so no separate
// Chimu storage is declared.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset, ptype;
// Number of directions that actually contributed to result_* on this call.
int num = 0;
// Xp
// Every direction below follows the same three steps:
//  1. Local requested and entry is local: LOAD_CHIMU, then the direction's
//     *_PROJ macro, then PERMUTE_DIR if the entry's _permute flag is set.
//  2. Nonlocal requested and entry is not local: LOAD_CHI instead.
//  3. If either of the above applied: MULT_2SPIN for this direction, the
//     matching *_RECON_ACCUM into result_*, and count the direction in num.
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XM_RECON_ACCUM;
num++;
}
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YM_RECON_ACCUM;
num++;
}
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZM_RECON_ACCUM;
num++;
}
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TM_RECON_ACCUM;
num++;
}
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XP_RECON_ACCUM;
num++;
}
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YP_RECON_ACCUM;
num++;
}
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZP_RECON_ACCUM;
num++;
}
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TP_RECON_ACCUM;
num++;
}
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
// Local pass (with or without Nonlocal): overwrite the output site, even if
// no direction contributed and the accumulators are still as zeroit() left them.
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
vstream(ref()(1)(0),result_10*(-0.5));
vstream(ref()(1)(1),result_11*(-0.5));
vstream(ref()(1)(2),result_12*(-0.5));
vstream(ref()(2)(0),result_20*(-0.5));
vstream(ref()(2)(1),result_21*(-0.5));
vstream(ref()(2)(2),result_22*(-0.5));
vstream(ref()(3)(0),result_30*(-0.5));
vstream(ref()(3)(1),result_31*(-0.5));
vstream(ref()(3)(2),result_32*(-0.5));
return 1;
} else if ( num ) {
// Nonlocal-only pass with at least one contributing direction: add onto the
// value already stored at the output site instead of overwriting it.
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
return 1;
}
// Nothing contributed and Local was not requested: output site left untouched.
return 0;
}
/*
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
@ -795,7 +1087,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
vstream(ref()(3)(2),result_32*(-0.5));
}
}
*/
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////