1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-05 19:55:56 +01:00

Staggered overlap comms comput

This commit is contained in:
Azusa Yamaguchi 2018-01-09 13:02:52 +00:00
parent e55397bc13
commit 24162c9ead
10 changed files with 869 additions and 489 deletions

View File

@ -124,6 +124,15 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
// Allocate the required comms buffer
ImportGauge(_Uthin,_Ufat);
int LLs = FiveDimGrid._rdimensions[0];
int vol4= FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
@ -223,6 +232,157 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
assert(0);
}
/*CHANGE */
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
DhopTotalTime+=usecond();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
int len = U._grid->oSites();
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U._grid->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
}
} else {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
}
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
}
} else {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
/*CHANGE END*/
/* ORG
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
@ -254,6 +414,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
*/
template<class Impl>
@ -336,6 +497,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();

View File

@ -64,6 +64,8 @@ namespace QCD {
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
@ -119,6 +121,23 @@ namespace QCD {
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
GaugeField &_Ufat,

View File

@ -32,223 +32,242 @@ namespace Grid {
namespace QCD {
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
multLink(Uchi, U._odata[sU], *chi_p, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
template <class Impl>
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew = 0;
if (threeLink) skew=8;
///////////////////////////
// Xp
///////////////////////////
int skew;
SE = st.GetEntry(ptype, Xp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
///////////////////////////
// Yp
///////////////////////////
SE = st.GetEntry(ptype, Yp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
///////////////////////////
// Zp
///////////////////////////
SE = st.GetEntry(ptype, Zp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
///////////////////////////
// Tp
///////////////////////////
SE = st.GetEntry(ptype, Tp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
///////////////////////////
// Xm
///////////////////////////
SE = st.GetEntry(ptype, Xm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
///////////////////////////
// Ym
///////////////////////////
SE = st.GetEntry(ptype, Ym+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
///////////////////////////
// Zm
///////////////////////////
SE = st.GetEntry(ptype, Zm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
///////////////////////////
// Tm
///////////////////////////
SE = st.GetEntry(ptype, Tm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
vstream(out, Uchi);
};
///////////////////////////////////////////////////
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
};
///////////////////////////////////////////////////
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
if ( nmu ) {
if ( dag ) {
out._odata[sF] = out._odata[sF] - Uchi;
} else {
out._odata[sF] = out._odata[sF] + Uchi;
}
}
}
};
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out) {
SiteSpinor naik;
SiteSpinor naive;
int oneLink =0;
int threeLink=1;
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,
int interior,int exterior)
{
int dag=1;
switch(Opt) {
#ifdef AVX512
//FIXME; move the sign into the Asm routine
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
for(int s=0;s<LLs;s++) {
int sF=s+LLs*sU;
out._odata[sF]=-out._odata[sF];
}
break;
#endif
case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
break;
case OptGeneric:
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =-naive-naik;
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
int sU, const FermionField &in, FermionField &out,
int dag,int interior,int exterior)
{
int oneLink =0;
int threeLink=1;
SiteSpinor naik;
SiteSpinor naive;
int dag=0;
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteAsmInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteAsmExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
#endif
case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// assert(sF<in._odata.size());
// assert(sU< U._odata.size());
// assert(sF>=0); assert(sU>=0);
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik;
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:

View File

@ -38,8 +38,9 @@ namespace QCD {
class StaggeredKernelsStatic {
public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
// S-direction is INNERMOST and takes no part in the parity.
static int Opt; // these are a temporary hack
enum { CommsAndCompute, CommsThenCompute };
static int Opt;
static int Comms;
};
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
@ -53,24 +54,70 @@ public:
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteAsmInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteAsmExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
int LLs, int sU, const FermionField &in, FermionField &out);
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
int LLs, int sU, const FermionField &in, FermionField &out);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
public:

View File

@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define nREDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define REDUCEa(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// FIXME is sign right in the VSUB ?
#define nREDUCEa(out) \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
@ -581,10 +618,9 @@ namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
assert(0);
};
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
// This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0);
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0);
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0);
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0);
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
DoubledGaugeField &U, \
DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out);
int sU, const FermionField &in, FermionField &out,int dag);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);

View File

@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid.h>
#define REGISTER
#define LOAD_CHI(b) \
const SiteSpinor & ref (b[offset]); \
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
UChi ## _1 += U_12*Chi_2;\
UChi ## _2 += U_22*Chi_2;
#define MULT_ADD(A,UChi) \
#define MULT_ADD(U,A,UChi) \
auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
permute##dir(Chi_1,Chi_1);\
permute##dir(Chi_2,Chi_2);
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else { \
LOAD_CHI(buf); \
}
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT(Dir,even); \
}
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
{ LOAD_CHI(buf); } \
{ MULT_ADD(U,Dir,even); } \
}
namespace Grid {
namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out, int dag)
{
SiteSpinor naik;
SiteSpinor naive;
int oneLink =0;
int threeLink=1;
int skew(0);
Real scale(1.0);
if(dag) scale = -1.0;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =scale*(naive+naik);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd even_0; // 12 regs on knc
REGISTER Simd even_1;
REGISTER Simd even_2;
REGISTER Simd odd_0; // 12 regs on knc
REGISTER Simd odd_1;
REGISTER Simd odd_2;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
REGISTER Simd Chi_0; // two spinor; 6 regs
REGISTER Simd Chi_1;
REGISTER Simd Chi_2;
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
REGISTER Simd U_02;
REGISTER Simd U_12;
REGISTER Simd U_22;
int skew = 0;
if (threeLink) skew=8;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
// Xp
SE=st.GetEntry(ptype,Xp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Xp,even);
}
// Yp
SE=st.GetEntry(ptype,Yp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Yp,odd);
}
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Zp
SE=st.GetEntry(ptype,Zp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
HAND_STENCIL_LEG (U,Zp,1,skew,even);
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
HAND_STENCIL_LEG (U,Xm,3,skew,even);
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
} else {
LOAD_CHI(buf);
vstream(out._odata[sF],result);
}
{
MULT_ADD(Zp,even);
}
// Tp
SE=st.GetEntry(ptype,Tp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tp,odd);
}
// Xm
SE=st.GetEntry(ptype,Xm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Xm,even);
}
// Ym
SE=st.GetEntry(ptype,Ym+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Ym,odd);
}
// Zm
SE=st.GetEntry(ptype,Zm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Zm,even);
}
// Tm
SE=st.GetEntry(ptype,Tm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tm,odd);
}
vstream(out()()(0),even_0+odd_0);
vstream(out()()(1),even_1+odd_1);
vstream(out()()(2),even_2+odd_2);
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out._odata[sF],result);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
int nmu=0;
skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
out._odata[sF] = out._odata[sF] + result;
}
}
}
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out, int dag);
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
SiteSpinor *buf, int sF, \
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
}}
}
}

View File

@ -267,41 +267,16 @@ public:
}
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
std::vector<int> same_node;
std::vector<int> surface_list;
WilsonStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
same_node(npoints)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
{
ZeroCountersi();
surface_list.resize(0);
};
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@ -362,23 +337,23 @@ public:
int dag = compress.dag;
int face_idx=0;
if ( dag ) {
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else {
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -444,8 +444,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
}
}
ptime = usecond() - start;
}
{
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;

View File

@ -149,7 +149,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::vector<int> _distances;
std::vector<int> _comm_buf_size;
std::vector<int> _permute_type;
std::vector<int> same_node;
std::vector<int> surface_list;
Vector<StencilEntry> _entries;
std::vector<Packet> Packets;
std::vector<Merge> Mergers;
@ -200,7 +202,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
int dimension = _directions[point];
int displacement = _distances[point];
assert( (displacement==1) || (displacement==-1));
int pd = _grid->_processors[dimension];
int fd = _grid->_fdimensions[dimension];
@ -215,9 +217,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
if ( ! comm_dim ) return 1;
int nbr_proc;
if (displacement==1) nbr_proc = 1;
else nbr_proc = pd-1;
if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
// Present hack only works for >= 4^4 subvol per node
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
}
};
// Move interior/exterior split into the generic stencil
// FIXME Explicit Ls in interface is a pain. Should just use a vol
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
comm_bytes_thr(npoints),
comm_enter_thr(npoints),
comm_leave_thr(npoints),
comm_time_thr(npoints)
comm_time_thr(npoints),
same_node(npoints)
{
face_table_computed=0;
_npoints = npoints;
@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
surface_list.resize(0);
int osites = _grid->oSites();

View File

@ -360,8 +360,10 @@ void Grid_init(int *argc,char ***argv)
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
} else {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);