mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-10 06:00:45 +01:00
Staggaered fermion optimised version
This commit is contained in:
parent
05c1924819
commit
bf7e3f20d4
@ -237,19 +237,32 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=s+LLs*sU;
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), sF, sU, in, out);
|
}
|
||||||
}}
|
|
||||||
} else {
|
} else {
|
||||||
|
#if 1
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=s+LLs*sU;
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),sF,sU,in,out);
|
}
|
||||||
}}
|
#else
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
for(int i=0;i<10;i++){
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
int me,mywork,myoff;
|
||||||
|
GridThread::GetWorkBarrier(len,me, mywork,myoff);
|
||||||
|
for (int ss = myoff; ss < myoff+mywork; ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
int sF=LLs*sU;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
|
}
|
||||||
|
GridThread::ThreadBarrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,65 +182,81 @@ void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, D
|
|||||||
vstream(out, Uchi);
|
vstream(out, Uchi);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Need controls to do interior, exterior, or both
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
int sU, const FermionField &in, FermionField &out) {
|
const FermionField &in, FermionField &out) {
|
||||||
|
int dag(1);
|
||||||
SiteSpinor naik;
|
SiteSpinor naik;
|
||||||
SiteSpinor naive;
|
SiteSpinor naive;
|
||||||
int oneLink =0;
|
int oneLink =0;
|
||||||
int threeLink=1;
|
int threeLink=1;
|
||||||
|
Real scale;
|
||||||
|
if(dag) scale = -1.0;
|
||||||
|
else scale = 1.0;
|
||||||
|
|
||||||
switch(Opt) {
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
case OptInlineAsm:
|
case OptInlineAsm:
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]);
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =-naive-naik;
|
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
for(int s=0;s<LLs;s++){
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =-naive-naik;
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||||
|
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||||
|
out._odata[sF] =scale*(naive+naik);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, FermionField &out) {
|
int sU, const FermionField &in, FermionField &out) {
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
int dag(0);
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
int oneLink =0;
|
||||||
static int once;
|
int threeLink=1;
|
||||||
|
SiteSpinor naik;
|
||||||
|
SiteSpinor naive;
|
||||||
|
static int once;
|
||||||
|
int sF=LLs*sU;
|
||||||
|
|
||||||
switch(Opt) {
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
case OptInlineAsm:
|
case OptInlineAsm:
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]);
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
break;
|
||||||
out._odata[sF] =naive+naik;
|
|
||||||
break;
|
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
for(int s=0;s<LLs;s++){
|
||||||
out._odata[sF] =naive+naik;
|
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||||
|
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||||
|
out._odata[sF] =naive+naik;
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
|
@ -56,17 +56,21 @@ public:
|
|||||||
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
||||||
|
|
||||||
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
void DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
||||||
|
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
|
||||||
|
|
||||||
|
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
|
||||||
|
int Lls, int sU, const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out);
|
int LLs, int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
int sF, int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
int LLs, int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -507,13 +507,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VLOAD(2,%%r8,pChi_12) \
|
VLOAD(2,%%r8,pChi_12) \
|
||||||
: : "r" (a1) : "%r8" );
|
: : "r" (a1) : "%r8" );
|
||||||
|
|
||||||
#define PF_CHI(a0) \
|
#define PF_CHI(a0)
|
||||||
|
#define PF_CHIa(a0) \
|
||||||
asm ( \
|
asm ( \
|
||||||
"movq %0, %%r8 \n\t" \
|
"movq %0, %%r8 \n\t" \
|
||||||
VPREFETCH1(0,%%r8) \
|
VPREFETCH1(0,%%r8) \
|
||||||
VPREFETCH1(1,%%r8) \
|
VPREFETCH1(1,%%r8) \
|
||||||
VPREFETCH1(2,%%r8) \
|
VPREFETCH1(2,%%r8) \
|
||||||
: : "r" (a0) : "%r8" ); \
|
: : "r" (a0) : "%r8" ); \
|
||||||
|
|
||||||
|
#define PF_GAUGE_XYZT(a0)
|
||||||
|
#define PF_GAUGE_XYZTa(a0) \
|
||||||
|
asm ( \
|
||||||
|
"movq %0, %%r8 \n\t" \
|
||||||
|
VPREFETCH1(0,%%r8) \
|
||||||
|
VPREFETCH1(1,%%r8) \
|
||||||
|
VPREFETCH1(2,%%r8) \
|
||||||
|
VPREFETCH1(3,%%r8) \
|
||||||
|
VPREFETCH1(4,%%r8) \
|
||||||
|
VPREFETCH1(5,%%r8) \
|
||||||
|
VPREFETCH1(6,%%r8) \
|
||||||
|
VPREFETCH1(7,%%r8) \
|
||||||
|
VPREFETCH1(8,%%r8) \
|
||||||
|
: : "r" (a0) : "%r8" ); \
|
||||||
|
|
||||||
|
#define PF_GAUGE_LS(a0)
|
||||||
|
#define PF_GAUGE_LSa(a0) \
|
||||||
|
asm ( \
|
||||||
|
"movq %0, %%r8 \n\t" \
|
||||||
|
VPREFETCH1(0,%%r8) \
|
||||||
|
VPREFETCH1(1,%%r8) \
|
||||||
|
: : "r" (a0) : "%r8" ); \
|
||||||
|
|
||||||
|
|
||||||
#define REDUCE(out) \
|
#define REDUCE(out) \
|
||||||
@ -556,40 +580,59 @@ template <class Impl>
|
|||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, SiteSpinor &out)
|
int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
|
|
||||||
}
|
};
|
||||||
|
|
||||||
#define PREPARE(X,Y,Z,T,skew,UU) \
|
|
||||||
SE0=st.GetEntry(ptype,X+skew,sF); \
|
//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; }
|
||||||
o0 = SE0->_offset; \
|
|
||||||
l0 = SE0->_is_local; \
|
#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
|
||||||
p0 = SE0->_permute; \
|
|
||||||
addr0 = l0 ? (uint64_t) &in._odata[o0] : (uint64_t) &buf[o0]; \
|
#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \
|
||||||
|
PREPARE(X,Y,Z,T,skew,UU); \
|
||||||
|
PF_GAUGE_XYZT(gauge0); \
|
||||||
|
PF_GAUGE_XYZT(gauge1); \
|
||||||
|
PF_GAUGE_XYZT(gauge2); \
|
||||||
|
PF_GAUGE_XYZT(gauge3);
|
||||||
|
|
||||||
|
#define PREPARE_LS(X,Y,Z,T,skew,UU) \
|
||||||
|
PREPARE(X,Y,Z,T,skew,UU); \
|
||||||
|
PF_GAUGE_LS(gauge0); \
|
||||||
|
PF_GAUGE_LS(gauge1); \
|
||||||
|
PF_GAUGE_LS(gauge2); \
|
||||||
|
PF_GAUGE_LS(gauge3);
|
||||||
|
|
||||||
|
#define PREPARE(X,Y,Z,T,skew,UU) \
|
||||||
|
SE0=st.GetEntry(ptype,X+skew,sF); \
|
||||||
|
o0 = SE0->_offset; \
|
||||||
|
l0 = SE0->_is_local; \
|
||||||
|
p0 = SE0->_permute; \
|
||||||
|
CONDITIONAL_MOVE(l0,o0,addr0); \
|
||||||
PF_CHI(addr0); \
|
PF_CHI(addr0); \
|
||||||
\
|
\
|
||||||
SE1=st.GetEntry(ptype,Y+skew,sF); \
|
SE1=st.GetEntry(ptype,Y+skew,sF); \
|
||||||
o1 = SE1->_offset; \
|
o1 = SE1->_offset; \
|
||||||
l1 = SE1->_is_local; \
|
l1 = SE1->_is_local; \
|
||||||
p1 = SE1->_permute; \
|
p1 = SE1->_permute; \
|
||||||
addr1 = l1 ? (uint64_t) &in._odata[o1] : (uint64_t) &buf[o1]; \
|
CONDITIONAL_MOVE(l1,o1,addr1); \
|
||||||
PF_CHI(addr1); \
|
PF_CHI(addr1); \
|
||||||
\
|
\
|
||||||
SE2=st.GetEntry(ptype,Z+skew,sF); \
|
SE2=st.GetEntry(ptype,Z+skew,sF); \
|
||||||
o2 = SE2->_offset; \
|
o2 = SE2->_offset; \
|
||||||
l2 = SE2->_is_local; \
|
l2 = SE2->_is_local; \
|
||||||
p2 = SE2->_permute; \
|
p2 = SE2->_permute; \
|
||||||
addr2 = l2 ? (uint64_t) &in._odata[o2] : (uint64_t) &buf[o2]; \
|
CONDITIONAL_MOVE(l2,o2,addr2); \
|
||||||
PF_CHI(addr2); \
|
PF_CHI(addr2); \
|
||||||
\
|
\
|
||||||
SE3=st.GetEntry(ptype,T+skew,sF); \
|
SE3=st.GetEntry(ptype,T+skew,sF); \
|
||||||
o3 = SE3->_offset; \
|
o3 = SE3->_offset; \
|
||||||
l3 = SE3->_is_local; \
|
l3 = SE3->_is_local; \
|
||||||
p3 = SE3->_permute; \
|
p3 = SE3->_permute; \
|
||||||
addr3 = l3 ? (uint64_t) &in._odata[o3] : (uint64_t) &buf[o3]; \
|
CONDITIONAL_MOVE(l3,o3,addr3); \
|
||||||
PF_CHI(addr3); \
|
PF_CHI(addr3); \
|
||||||
\
|
\
|
||||||
gauge0 =(uint64_t)&UU._odata[sU]( X ); \
|
gauge0 =(uint64_t)&UU._odata[sU]( X ); \
|
||||||
@ -602,12 +645,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, SiteSpinor &out)
|
int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
uint64_t addr0,addr1,addr2,addr3;
|
uint64_t addr0,addr1,addr2,addr3;
|
||||||
|
const SiteSpinor *in_p; in_p = &in._odata[0];
|
||||||
|
|
||||||
int o0,o1,o2,o3; // offsets
|
int o0,o1,o2,o3; // offsets
|
||||||
int l0,l1,l2,l3; // local
|
int l0,l1,l2,l3; // local
|
||||||
@ -618,42 +662,46 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// Xp, Yp, Zp, Tp
|
for(int s=0;s<LLs;s++){
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
int sF=s+LLs*sU;
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
// Xp, Yp, Zp, Tp
|
||||||
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
|
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out;
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is the single precision 5th direction vectorised kernel
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, SiteSpinor &out)
|
int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
uint64_t addr0,addr1,addr2,addr3;
|
uint64_t addr0,addr1,addr2,addr3;
|
||||||
|
const SiteSpinor *in_p; in_p = &in._odata[0];
|
||||||
|
|
||||||
int o0,o1,o2,o3; // offsets
|
int o0,o1,o2,o3; // offsets
|
||||||
int l0,l1,l2,l3; // local
|
int l0,l1,l2,l3; // local
|
||||||
@ -664,30 +712,34 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// Xp, Yp, Zp, Tp
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
// Xp, Yp, Zp, Tp
|
||||||
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
|
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
REDUCE(addr0);
|
||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
}
|
||||||
|
|
||||||
addr0 = (uint64_t) &out;
|
|
||||||
REDUCE(addr0);
|
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR3 __asm__ ( \
|
#define PERMUTE_DIR3 __asm__ ( \
|
||||||
@ -711,16 +763,18 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
VPERM0(Chi_12,Chi_12) );
|
VPERM0(Chi_12,Chi_12) );
|
||||||
|
|
||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
|
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, SiteSpinor &out)
|
int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
uint64_t addr0,addr1,addr2,addr3;
|
uint64_t addr0,addr1,addr2,addr3;
|
||||||
|
const SiteSpinor *in_p; in_p = &in._odata[0];
|
||||||
|
|
||||||
int o0,o1,o2,o3; // offsets
|
int o0,o1,o2,o3; // offsets
|
||||||
int l0,l1,l2,l3; // local
|
int l0,l1,l2,l3; // local
|
||||||
@ -731,66 +785,46 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// Xp, Yp, Zp, Tp
|
for(int s=0;s<LLs;s++){
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
|
||||||
LOAD_CHIa(addr0,addr1);
|
|
||||||
if (l0&&p0) { PERMUTE_DIR3; }
|
|
||||||
if (l1&&p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (l2&&p2) { PERMUTE_DIR1; }
|
|
||||||
if (l3&&p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
int sF=s+LLs*sU;
|
||||||
LOAD_CHIa(addr0,addr1);
|
// Xp, Yp, Zp, Tp
|
||||||
if (l0&&p0) { PERMUTE_DIR3; }
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
if (l1&&p1) { PERMUTE_DIR2; }
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (l2&&p2) { PERMUTE_DIR1; }
|
|
||||||
if (l3&&p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
if (l0&&p0) { PERMUTE_DIR3; }
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
if (l1&&p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (l2&&p2) { PERMUTE_DIR1; }
|
|
||||||
if (l3&&p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
if (l0&&p0) { PERMUTE_DIR3; }
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
if (l1&&p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (l2&&p2) { PERMUTE_DIR1; }
|
|
||||||
if (l3&&p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
addr0 = (uint64_t) &out;
|
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||||
REDUCEa(addr0);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// This is the single precision 5th direction vectorised kernel
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, SiteSpinor &out)
|
int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
uint64_t addr0,addr1,addr2,addr3;
|
uint64_t addr0,addr1,addr2,addr3;
|
||||||
|
const SiteSpinor *in_p; in_p = &in._odata[0];
|
||||||
|
|
||||||
int o0,o1,o2,o3; // offsets
|
int o0,o1,o2,o3; // offsets
|
||||||
int l0,l1,l2,l3; // local
|
int l0,l1,l2,l3; // local
|
||||||
@ -801,57 +835,35 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// Xp, Yp, Zp, Tp
|
for(int s=0;s<LLs;s++){
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
int sF=s+LLs*sU;
|
||||||
LOAD_CHIa(addr0,addr1);
|
// Xp, Yp, Zp, Tp
|
||||||
if (p0) { PERMUTE_DIR3; }
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
if (p1) { PERMUTE_DIR2; }
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
MULT_XYZT(gauge0,gauge1);
|
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (p2) { PERMUTE_DIR1; }
|
|
||||||
if (p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
if (p0) { PERMUTE_DIR3; }
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
if (p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (p2) { PERMUTE_DIR1; }
|
|
||||||
if (p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
if (p0) { PERMUTE_DIR3; }
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
if (p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (p2) { PERMUTE_DIR1; }
|
|
||||||
if (p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
if (p0) { PERMUTE_DIR3; }
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
if (p1) { PERMUTE_DIR2; }
|
|
||||||
MULT_ADD_XYZT(gauge0,gauge1);
|
|
||||||
LOAD_CHIa(addr2,addr3);
|
|
||||||
if (p2) { PERMUTE_DIR1; }
|
|
||||||
if (p3) { PERMUTE_DIR0; }
|
|
||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
|
||||||
|
|
||||||
addr0 = (uint64_t) &out;
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
|
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
|
||||||
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
|
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
|
||||||
|
|
||||||
|
@ -91,7 +91,32 @@ namespace QCD {
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs,
|
||||||
|
int sU, const FermionField &in, FermionField &out, int dag) {
|
||||||
|
|
||||||
|
SiteSpinor naik;
|
||||||
|
SiteSpinor naive;
|
||||||
|
int oneLink =0;
|
||||||
|
int threeLink=1;
|
||||||
|
int skew(0);
|
||||||
|
Real scale(1.0);
|
||||||
|
|
||||||
|
if(dag) scale = -1.0;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
DhopSiteDepthHandLocal(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||||
|
DhopSiteDepthHandLocal(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||||
|
out._odata[sF] =scale*(naive+naik);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
||||||
{
|
{
|
||||||
|
@ -153,7 +153,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
||||||
int ncall=1000;
|
int ncall=100000;
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
Ds.Dhop(src,result,0);
|
Ds.Dhop(src,result,0);
|
||||||
|
@ -57,18 +57,19 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::vector<int> seeds({1,2,3,4});
|
std::vector<int> seeds({1,2,3,4});
|
||||||
|
/*
|
||||||
GridParallelRNG pRNG4(UGrid);
|
GridParallelRNG pRNG4(UGrid);
|
||||||
GridParallelRNG pRNG5(FGrid);
|
GridParallelRNG pRNG5(FGrid);
|
||||||
pRNG4.SeedFixedIntegers(seeds);
|
pRNG4.SeedFixedIntegers(seeds);
|
||||||
pRNG5.SeedFixedIntegers(seeds);
|
pRNG5.SeedFixedIntegers(seeds);
|
||||||
|
*/
|
||||||
typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;
|
typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;
|
||||||
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
|
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
|
||||||
typename ImprovedStaggeredFermion5DR::ImplParams params;
|
typename ImprovedStaggeredFermion5DR::ImplParams params;
|
||||||
|
|
||||||
FermionField src (FGrid);
|
FermionField src (FGrid); src=zero;
|
||||||
|
|
||||||
random(pRNG5,src);
|
// random(pRNG5,src);
|
||||||
/*
|
/*
|
||||||
std::vector<int> site({0,0,0,0,0});
|
std::vector<int> site({0,0,0,0,0});
|
||||||
ColourVector cv = zero;
|
ColourVector cv = zero;
|
||||||
@ -80,10 +81,10 @@ int main (int argc, char ** argv)
|
|||||||
FermionField result(FGrid); result=zero;
|
FermionField result(FGrid); result=zero;
|
||||||
FermionField tmp(FGrid); tmp=zero;
|
FermionField tmp(FGrid); tmp=zero;
|
||||||
FermionField err(FGrid); tmp=zero;
|
FermionField err(FGrid); tmp=zero;
|
||||||
FermionField phi (FGrid); random(pRNG5,phi);
|
FermionField phi (FGrid); phi=1.0;//random(pRNG5,phi);
|
||||||
FermionField chi (FGrid); random(pRNG5,chi);
|
FermionField chi (FGrid); chi=1.0;//random(pRNG5,chi);
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG4,Umu);
|
LatticeGaugeField Umu(UGrid); Umu=1.0; //SU3::HotConfiguration(pRNG4,Umu);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
for(int mu=1;mu<4;mu++){
|
for(int mu=1;mu<4;mu++){
|
||||||
@ -109,15 +110,18 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
|
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
|
|
||||||
int ncall=1000;
|
int ncall=1000;
|
||||||
double t0=usecond();
|
int ncall1=1000;
|
||||||
for(int i=0;i<ncall;i++){
|
double t0(0),t1(0);
|
||||||
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
Ds.Dhop(src,result,0);
|
Ds.Dhop(src,result,0);
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
t1=usecond();
|
||||||
|
|
||||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
@ -128,7 +132,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall1;i++){
|
||||||
Ds.Dhop(src,tmp,0);
|
Ds.Dhop(src,tmp,0);
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
@ -139,19 +143,20 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
err = tmp-result;
|
err = tmp-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
FermionField ssrc (sFGrid); localConvert(src,ssrc);
|
FermionField ssrc (sFGrid); localConvert(src,ssrc);
|
||||||
FermionField sresult(sFGrid); sresult=zero;
|
FermionField sresult(sFGrid); sresult=zero;
|
||||||
|
|
||||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall1;i++){
|
||||||
sDs.Dhop(ssrc,sresult,0);
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
localConvert(sresult,tmp);
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
@ -160,9 +165,9 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
err = tmp-result;
|
err = tmp-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
int extra=1;
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall1*extra;i++){
|
||||||
sDs.Dhop(ssrc,sresult,0);
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
@ -170,11 +175,12 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||||
|
|
||||||
err = tmp-result;
|
err = tmp-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user