mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Staggered overlap comms compute
This commit is contained in:
parent
e55397bc13
commit
24162c9ead
@ -124,6 +124,15 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
|||||||
|
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Uthin,_Ufat);
|
ImportGauge(_Uthin,_Ufat);
|
||||||
|
|
||||||
|
int LLs = FiveDimGrid._rdimensions[0];
|
||||||
|
int vol4= FourDimGrid.oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
|
vol4=FourDimRedBlackGrid.oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -223,6 +232,157 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*CHANGE */
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
DhopTotalTime-=usecond();
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
DhopTotalTime+=usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
|
Compressor compressor;
|
||||||
|
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||||
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
double ctime=0;
|
||||||
|
double ptime=0;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U._grid->oSites(); // 4d vol
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptime = usecond() - start;
|
||||||
|
} else {
|
||||||
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
ctime = usecond() - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
st.CollateThreads();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2-=usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2+=usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
Compressor compressor;
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
|
}
|
||||||
|
/*CHANGE END*/
|
||||||
|
|
||||||
|
/* ORG
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
@ -254,6 +414,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -336,6 +497,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
|||||||
DhopTotalTime = 0;
|
DhopTotalTime = 0;
|
||||||
DhopCommTime = 0;
|
DhopCommTime = 0;
|
||||||
DhopComputeTime = 0;
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
|
||||||
Stencil.ZeroCounters();
|
Stencil.ZeroCounters();
|
||||||
StencilEven.ZeroCounters();
|
StencilEven.ZeroCounters();
|
||||||
StencilOdd.ZeroCounters();
|
StencilOdd.ZeroCounters();
|
||||||
|
@ -64,6 +64,8 @@ namespace QCD {
|
|||||||
double DhopCalls;
|
double DhopCalls;
|
||||||
double DhopCommTime;
|
double DhopCommTime;
|
||||||
double DhopComputeTime;
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
@ -119,6 +121,23 @@ namespace QCD {
|
|||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
|
// Dhop implementation that overlaps the halo communication with the
// interior compute; dag selects the daggered kernel.
void DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
				 DoubledGaugeField &U, DoubledGaugeField &UUU,
				 const FermionField &in, FermionField &out,
				 int dag);
|
||||||
|
|
||||||
|
// Dhop implementation that completes the halo exchange before starting
// the compute; dag selects the daggered kernel.
void DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
			     DoubledGaugeField &U, DoubledGaugeField &UUU,
			     const FermionField &in, FermionField &out,
			     int dag);
|
||||||
|
|
||||||
|
|
||||||
// Constructors
|
// Constructors
|
||||||
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
||||||
GaugeField &_Ufat,
|
GaugeField &_Ufat,
|
||||||
|
@ -32,223 +32,242 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||||
|
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||||
|
|
||||||
|
// One stencil leg, interior and exterior contributions together.
//
// Looks up stencil point (Dir+skew) for site sF, selects the neighbour spinor
//   - local + permute : permuted copy placed in the scratch `chi`
//   - local           : read in place from `in`
//   - otherwise       : read from the comms buffer `buf`
// and then calls multLink(Uchi, U._odata[sU], *chi_p, Dir).
//
// Expects these names in the enclosing scope:
//   st, in, buf, sF, sU, SE, ptype, chi, chi_p, Uchi.
// Wrapped in do { } while (0) so that an invocation is a single statement;
// invoke it with a trailing semicolon.
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  do {								\
    SE = st.GetEntry(ptype, Dir+skew, sF);			\
    if (SE->_is_local ) {					\
      if (SE->_permute) {					\
	chi_p = &chi;						\
	permute(chi,  in._odata[SE->_offset], ptype);		\
      } else {							\
	chi_p = &in._odata[SE->_offset];			\
      }								\
    } else {							\
      chi_p = &buf[SE->_offset];				\
    }								\
    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
  } while (0)
|
||||||
|
|
||||||
|
// One stencil leg, interior contributions only.
//
// Same neighbour selection as the full leg, except that a non-local
// neighbour is read from `buf` only when st.same_node[Dir] is set.
// multLink is called only when the neighbour is local or on the same node;
// otherwise the leg contributes nothing and chi_p is left untouched.
//
// Expects these names in the enclosing scope:
//   st, in, buf, sF, sU, SE, ptype, chi, chi_p, Uchi.
// Wrapped in do { } while (0) so that an invocation is a single statement;
// invoke it with a trailing semicolon.
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  do {								\
    SE = st.GetEntry(ptype, Dir+skew, sF);			\
    if (SE->_is_local ) {					\
      if (SE->_permute) {					\
	chi_p = &chi;						\
	permute(chi,  in._odata[SE->_offset], ptype);		\
      } else {							\
	chi_p = &in._odata[SE->_offset];			\
      }								\
    } else if ( st.same_node[Dir] ) {				\
      chi_p = &buf[SE->_offset];				\
    }								\
    if (SE->_is_local || st.same_node[Dir] ) {			\
      multLink(Uchi, U._odata[sU], *chi_p, Dir);		\
    }								\
  } while (0)
|
||||||
|
|
||||||
|
// One stencil leg, exterior contributions only.
//
// Acts only when the neighbour is neither local nor on the same node
// (st.same_node[Dir] clear): bumps the counter `nmu`, reads the neighbour
// from the comms buffer `buf` and calls multLink(Uchi, U._odata[sU], *chi_p, Dir).
//
// Expects these names in the enclosing scope:
//   st, buf, sF, sU, SE, ptype, chi_p, Uchi, nmu.
// Wrapped in do { } while (0) so that an invocation is a single statement
// (the bare if-block form could capture a following `else`);
// invoke it with a trailing semicolon.
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  do {								\
    SE = st.GetEntry(ptype, Dir+skew, sF);			\
    if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
      nmu++;							\
      chi_p = &buf[SE->_offset];				\
      multLink(Uchi, U._odata[sU], *chi_p, Dir);		\
    }								\
  } while (0)
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic implementation; move to different file?
|
// Generic implementation; move to different file?
|
||||||
////////////////////////////////////////////
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int sF,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
SiteSpinor Uchi;
|
SiteSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int skew = 0;
|
int skew;
|
||||||
if (threeLink) skew=8;
|
|
||||||
///////////////////////////
|
|
||||||
// Xp
|
|
||||||
///////////////////////////
|
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Xp+skew, sF);
|
for(int s=0;s<LLs;s++){
|
||||||
if (SE->_is_local) {
|
int sF=LLs*sU+s;
|
||||||
if (SE->_permute) {
|
skew = 0;
|
||||||
chi_p = χ
|
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||||
} else {
|
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
|
||||||
chi_p = &in._odata[SE->_offset];
|
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
|
||||||
}
|
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
|
||||||
} else {
|
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||||
chi_p = &buf[SE->_offset];
|
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF], Uchi);
|
||||||
}
|
}
|
||||||
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Yp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Yp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Xm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Xm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Ym
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Ym+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
|
|
||||||
|
|
||||||
vstream(out, Uchi);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from interior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF], Uchi);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from exterior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int nmu=0;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
out._odata[sF] = out._odata[sF] - Uchi;
|
||||||
|
} else {
|
||||||
|
out._odata[sF] = out._odata[sF] + Uchi;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Driving / wrapping routine to select right kernel
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionField &in, FermionField &out) {
|
const FermionField &in, FermionField &out,
|
||||||
SiteSpinor naik;
|
int interior,int exterior)
|
||||||
SiteSpinor naive;
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int dag=1;
|
int dag=1;
|
||||||
switch(Opt) {
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
#ifdef AVX512
|
};
|
||||||
//FIXME; move the sign into the Asm routine
|
|
||||||
case OptInlineAsm:
|
template <class Impl>
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
for(int s=0;s<LLs;s++) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
int sF=s+LLs*sU;
|
const FermionField &in, FermionField &out,
|
||||||
out._odata[sF]=-out._odata[sF];
|
int interior,int exterior)
|
||||||
}
|
{
|
||||||
break;
|
int dag=0;
|
||||||
#endif
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
case OptHandUnroll:
|
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
break;
|
|
||||||
case OptGeneric:
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =-naive-naik;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, FermionField &out)
|
int sU, const FermionField &in, FermionField &out,
|
||||||
|
int dag,int interior,int exterior)
|
||||||
{
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int dag=0;
|
|
||||||
switch(Opt) {
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
case OptInlineAsm:
|
case OptInlineAsm:
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteAsmInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteAsmExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
for(int s=0;s<LLs;s++){
|
if ( interior && exterior ) {
|
||||||
int sF=LLs*sU+s;
|
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF<in._odata.size());
|
} else if ( interior ) {
|
||||||
// assert(sU< U._odata.size());
|
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF>=0); assert(sU>=0);
|
} else if ( exterior ) {
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =naive+naik;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -38,8 +38,9 @@ namespace QCD {
|
|||||||
class StaggeredKernelsStatic {
|
class StaggeredKernelsStatic {
|
||||||
public:
|
public:
|
||||||
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
enum { CommsAndCompute, CommsThenCompute };
|
||||||
static int Opt; // these are a temporary hack
|
static int Opt;
|
||||||
|
static int Comms;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
||||||
@ -53,24 +54,70 @@ public:
|
|||||||
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
||||||
|
|
||||||
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
// Generic Nc kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
|
// Asm Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteAsmInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteAsmExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Generic interface; fan out to right routine
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
|
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
|
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(2,%0,pUChi_02) \
|
VSTORE(2,%0,pUChi_02) \
|
||||||
: : "r" (out) : "memory" );
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
#define nREDUCE(out) \
|
||||||
|
asm ( \
|
||||||
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_12,UChi_02) \
|
||||||
|
VADD(UChi_30,UChi_20,UChi_30) \
|
||||||
|
VADD(UChi_31,UChi_21,UChi_31) \
|
||||||
|
VADD(UChi_32,UChi_22,UChi_32) \
|
||||||
|
VADD(UChi_00,UChi_30,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_31,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||||
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define REDUCEa(out) \
|
#define REDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VADD(UChi_00,UChi_10,UChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VADD(UChi_01,UChi_11,UChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
// FIXME is sign right in the VSUB ?
|
||||||
|
#define nREDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VSTORE(0,%0,pUChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VSTORE(1,%0,pUChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VSTORE(2,%0,pUChi_02) \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
: : "r" (out) : "memory" );
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0);\
|
||||||
@ -581,10 +618,9 @@ namespace QCD {
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
};
|
};
|
||||||
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
DoubledGaugeField &U, \
|
DoubledGaugeField &U, \
|
||||||
DoubledGaugeField &UUU, \
|
DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, \
|
||||||
int sU, const FermionField &in, FermionField &out);
|
int sU, const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||||
|
@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
#define REGISTER
|
|
||||||
|
|
||||||
#define LOAD_CHI(b) \
|
#define LOAD_CHI(b) \
|
||||||
const SiteSpinor & ref (b[offset]); \
|
const SiteSpinor & ref (b[offset]); \
|
||||||
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi ## _1 += U_12*Chi_2;\
|
UChi ## _1 += U_12*Chi_2;\
|
||||||
UChi ## _2 += U_22*Chi_2;
|
UChi ## _2 += U_22*Chi_2;
|
||||||
|
|
||||||
#define MULT_ADD(A,UChi) \
|
#define MULT_ADD(U,A,UChi) \
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0); \
|
||||||
permute##dir(Chi_1,Chi_1);\
|
permute##dir(Chi_1,Chi_1); \
|
||||||
permute##dir(Chi_2,Chi_2);
|
permute##dir(Chi_2,Chi_2);
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT(Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[Dir] ) { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
} \
|
||||||
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
|
nmu++; \
|
||||||
|
{ LOAD_CHI(buf); } \
|
||||||
|
{ MULT_ADD(U,Dir,even); } \
|
||||||
|
}
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int LLs,
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, FermionField &out, int dag)
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
{
|
const FermionField &in, FermionField &out,int dag)
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int skew(0);
|
|
||||||
Real scale(1.0);
|
|
||||||
|
|
||||||
if(dag) scale = -1.0;
|
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =scale*(naive+naik);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
SiteSpinor *buf, int sF,
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
|
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd even_0; // 12 regs on knc
|
Simd even_0; // 12 regs on knc
|
||||||
REGISTER Simd even_1;
|
Simd even_1;
|
||||||
REGISTER Simd even_2;
|
Simd even_2;
|
||||||
REGISTER Simd odd_0; // 12 regs on knc
|
Simd odd_0; // 12 regs on knc
|
||||||
REGISTER Simd odd_1;
|
Simd odd_1;
|
||||||
REGISTER Simd odd_2;
|
Simd odd_2;
|
||||||
|
|
||||||
REGISTER Simd Chi_0; // two spinor; 6 regs
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
REGISTER Simd Chi_1;
|
Simd Chi_1;
|
||||||
REGISTER Simd Chi_2;
|
Simd Chi_2;
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
Simd U_00; // two rows of U matrix
|
||||||
REGISTER Simd U_10;
|
Simd U_10;
|
||||||
REGISTER Simd U_20;
|
Simd U_20;
|
||||||
REGISTER Simd U_01;
|
Simd U_01;
|
||||||
REGISTER Simd U_11;
|
Simd U_11;
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
Simd U_21; // 2 reg left.
|
||||||
REGISTER Simd U_02;
|
Simd U_02;
|
||||||
REGISTER Simd U_12;
|
Simd U_12;
|
||||||
REGISTER Simd U_22;
|
Simd U_22;
|
||||||
|
|
||||||
int skew = 0;
|
|
||||||
if (threeLink) skew=8;
|
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
// Xp
|
for(int s=0;s<LLs;s++){
|
||||||
SE=st.GetEntry(ptype,Xp+skew,sF);
|
int sF=s+LLs*sU;
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT(Xp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT(Yp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
skew = 0;
|
||||||
// Zp
|
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||||
SE=st.GetEntry(ptype,Zp+skew,sF);
|
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
|
||||||
offset = SE->_offset;
|
HAND_STENCIL_LEG (U,Zp,1,skew,even);
|
||||||
local = SE->_is_local;
|
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
|
||||||
perm = SE->_permute;
|
HAND_STENCIL_LEG (U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||||
if ( local ) {
|
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||||
LOAD_CHI(in._odata);
|
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||||
if ( perm) {
|
skew = 8;
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
} else {
|
vstream(out._odata[sF],result);
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
MULT_ADD(Zp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Xm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Ym,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Zm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tm,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
vstream(out()()(0),even_0+odd_0);
|
|
||||||
vstream(out()()(1),even_1+odd_1);
|
|
||||||
vstream(out()()(2),even_2+odd_2);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF],result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
int nmu=0;
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Add sum of all exterior connected stencil legs
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
out._odata[sF] = out._odata[sF] + result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
int sU, const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
|
||||||
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
|
|
||||||
SiteSpinor *buf, int sF, \
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
|
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||||
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
|
||||||
|
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -267,41 +267,16 @@ public:
|
|||||||
}
|
}
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
|
|
||||||
std::vector<int> same_node;
|
|
||||||
std::vector<int> surface_list;
|
|
||||||
|
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
||||||
same_node(npoints)
|
|
||||||
{
|
{
|
||||||
ZeroCountersi();
|
ZeroCountersi();
|
||||||
surface_list.resize(0);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void BuildSurfaceList(int Ls,int vol4){
|
|
||||||
|
|
||||||
// find same node for SHM
|
|
||||||
// Here we know the distance is 1 for WilsonStencil
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
same_node[point] = this->SameNode(point);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
|
||||||
int local = 1;
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
|
||||||
local = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(local == 0) {
|
|
||||||
surface_list.push_back(site);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class compressor>
|
template < class compressor>
|
||||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
@ -362,23 +337,23 @@ public:
|
|||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
||||||
} else {
|
} else {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
||||||
}
|
}
|
||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
|
@ -444,8 +444,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ptime = usecond() - start;
|
ptime = usecond() - start;
|
||||||
}
|
} else {
|
||||||
{
|
|
||||||
double start = usecond();
|
double start = usecond();
|
||||||
st.CommunicateThreaded();
|
st.CommunicateThreaded();
|
||||||
ctime = usecond() - start;
|
ctime = usecond() - start;
|
||||||
|
@ -149,7 +149,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
std::vector<int> _distances;
|
std::vector<int> _distances;
|
||||||
std::vector<int> _comm_buf_size;
|
std::vector<int> _comm_buf_size;
|
||||||
std::vector<int> _permute_type;
|
std::vector<int> _permute_type;
|
||||||
|
std::vector<int> same_node;
|
||||||
|
std::vector<int> surface_list;
|
||||||
|
|
||||||
Vector<StencilEntry> _entries;
|
Vector<StencilEntry> _entries;
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
@ -200,7 +202,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = _directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = _distances[point];
|
||||||
assert( (displacement==1) || (displacement==-1));
|
|
||||||
|
|
||||||
int pd = _grid->_processors[dimension];
|
int pd = _grid->_processors[dimension];
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
@ -215,9 +217,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
if ( ! comm_dim ) return 1;
|
if ( ! comm_dim ) return 1;
|
||||||
|
|
||||||
int nbr_proc;
|
int nbr_proc;
|
||||||
if (displacement==1) nbr_proc = 1;
|
if (displacement>0) nbr_proc = 1;
|
||||||
else nbr_proc = pd-1;
|
else nbr_proc = pd-1;
|
||||||
|
|
||||||
|
// FIXME this logic needs to be sorted for three link term
|
||||||
|
// assert( (displacement==1) || (displacement==-1));
|
||||||
|
// Present hack only works for >= 4^4 subvol per node
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
||||||
@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Move interior/exterior split into the generic stencil
|
||||||
|
// FIXME: the explicit Ls in the interface is a pain; it should just take a volume
|
||||||
|
void BuildSurfaceList(int Ls,int vol4){
// Stores SameNode(point) in same_node[point] for every stencil point, then
// appends to surface_list each site in [0,vol4) for which at least one point
// is neither reported local by GetNodeLocal nor flagged in same_node.
//   Ls   : multiplier that turns a site number into the index handed to
//          GetNodeLocal (site*Ls); presumably the fifth-dimension extent
//          -- TODO confirm against callers.
//   vol4 : number of sites scanned.
// NOTE: sites are only appended; surface_list is not cleared here, so a
//       second call on the same object would append the same sites again.
|
||||||
|
|
||||||
|
// find same node for SHM
|
||||||
|
// Here we know the distance is 1 for WilsonStencil
// NOTE(review): confirm this assumption holds for every stencil type that calls this routine
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
same_node[point] = this->SameNode(point);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan the sites and collect those that fail the locality test for any point
for(int site = 0 ;site< vol4;site++){
|
||||||
|
// Flag starts at 1 and is cleared to 0 by any point that fails the test below
int local = 1;
|
||||||
|
// Every point is visited; there is no early exit once the flag is cleared
for(int point=0;point<this->_npoints;point++){
|
||||||
|
// Only index site*Ls is tested for each site; presumably the remaining
// Ls-1 entries of the site share the same result -- TODO confirm
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||||
|
local = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Record the site number itself (not site*Ls) when the flag was cleared
if(local == 0) {
|
||||||
|
surface_list.push_back(site);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
comm_bytes_thr(npoints),
|
comm_bytes_thr(npoints),
|
||||||
comm_enter_thr(npoints),
|
comm_enter_thr(npoints),
|
||||||
comm_leave_thr(npoints),
|
comm_leave_thr(npoints),
|
||||||
comm_time_thr(npoints)
|
comm_time_thr(npoints),
|
||||||
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
_directions = directions;
|
_directions = directions;
|
||||||
_distances = distances;
|
_distances = distances;
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
|
surface_list.resize(0);
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
@ -360,8 +360,10 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
|
||||||
} else {
|
} else {
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user