1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-25 13:15:55 +01:00
This commit is contained in:
azusayamaguchi 2017-02-21 23:01:25 +00:00
parent bf7e3f20d4
commit 1c30e9a961
8 changed files with 163 additions and 153 deletions

View File

@ -115,7 +115,7 @@ int main (int argc, char ** argv)
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl; std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=100000; int ncall=1000;
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0); Ds.Dhop(src,result,0);

View File

@ -338,12 +338,12 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
if (dag == DaggerYes) { if (dag == DaggerYes) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out); Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out); Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
} }
} }
}; };

View File

@ -228,9 +228,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
Compressor compressor; Compressor compressor;
int LLs = in._grid->_rdimensions[0]; int LLs = in._grid->_rdimensions[0];
st.HaloExchange(in,compressor); st.HaloExchange(in,compressor);
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -241,28 +239,11 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
} }
} else { } else {
#if 1
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss; int sU=ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
} }
#else
#pragma omp parallel
{
for(int i=0;i<10;i++){
int len = U._grid->oSites();
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
for (int ss = myoff; ss < myoff+mywork; ss++) {
int sU=ss;
int sF=LLs*sU;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
GridThread::ThreadBarrier();
}
}
#endif
} }
} }

View File

@ -186,32 +186,31 @@ template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out) { const FermionField &in, FermionField &out) {
int dag(1);
SiteSpinor naik; SiteSpinor naik;
SiteSpinor naive; SiteSpinor naive;
int oneLink =0; int oneLink =0;
int threeLink=1; int threeLink=1;
Real scale; int dag=1;
if(dag) scale = -1.0;
else scale = 1.0;
switch(Opt) { switch(Opt) {
#ifdef AVX512 #ifdef AVX512
//FIXME; move the sign into the Asm routine
case OptInlineAsm: case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
for(int s=0;s<LLs;s++) {
int sF=s+LLs*sU;
out._odata[sF]=-out._odata[sF];
}
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
break; break;
case OptGeneric: case OptGeneric:
for(int s=0;s<LLs;s++){ for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; int sF=s+LLs*sU;
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =scale*(naive+naik); out._odata[sF] =-naive-naik;
} }
break; break;
default: default:
@ -223,17 +222,13 @@ void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out) { int sU, const FermionField &in, FermionField &out)
{
int dag(0);
int oneLink =0; int oneLink =0;
int threeLink=1; int threeLink=1;
SiteSpinor naik; SiteSpinor naik;
SiteSpinor naive; SiteSpinor naive;
static int once; int dag=0;
int sF=LLs*sU;
switch(Opt) { switch(Opt) {
#ifdef AVX512 #ifdef AVX512
case OptInlineAsm: case OptInlineAsm:
@ -241,22 +236,23 @@ void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
break; break;
case OptGeneric: case OptGeneric:
for(int s=0;s<LLs;s++){ for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// assert(sF<in._odata.size());
// assert(sU< U._odata.size());
// assert(sF>=0); assert(sU>=0);
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik; out._odata[sF] =naive+naik;
} }
break; break;
default: default:
assert(0); assert(0);
break; break;
} }
}; };
template <class Impl> template <class Impl>

View File

@ -57,11 +57,11 @@ public:
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
void DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
int Lls, int sU, const FermionField &in, FermionField &out, int dag); int LLs, int sU, const FermionField &in, FermionField &out, int dag);
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
int LLs, int sU, const FermionField &in, FermionField &out); int LLs, int sU, const FermionField &in, FermionField &out);

View File

@ -762,6 +762,14 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
VPERM0(Chi_11,Chi_11) \ VPERM0(Chi_11,Chi_11) \
VPERM0(Chi_12,Chi_12) ); VPERM0(Chi_12,Chi_12) );
#define PERMUTE01 \
if ( p0 ) { PERMUTE_DIR3; }\
if ( p1 ) { PERMUTE_DIR2; }
#define PERMUTE23 \
if ( p2 ) { PERMUTE_DIR1; }\
if ( p3 ) { PERMUTE_DIR0; }
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h> #include <simd/Intel512single.h>
@ -790,30 +798,45 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
int sF=s+LLs*sU; int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U); PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU); PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU); PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0); REDUCEa(addr0);
} }
#else #else
assert(0); assert(0);
#endif #endif
} }
#include <simd/Intel512double.h> #include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
@ -840,23 +863,39 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
int sF=s+LLs*sU; int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U); PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU); PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU); PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHIa(addr0,addr1);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0); REDUCEa(addr0);
} }
#else #else
assert(0); assert(0);

View File

@ -91,10 +91,10 @@ namespace QCD {
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out, int dag) { int sU, const FermionField &in, FermionField &out, int dag)
{
SiteSpinor naik; SiteSpinor naik;
SiteSpinor naive; SiteSpinor naive;
int oneLink =0; int oneLink =0;
@ -105,20 +105,17 @@ void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &l
if(dag) scale = -1.0; if(dag) scale = -1.0;
for(int s=0;s<LLs;s++){ for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; int sF=s+LLs*sU;
DhopSiteDepthHandLocal(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepthHandLocal(st,lo,UUU,buf,sF,sU,in,naik,threeLink); DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =scale*(naive+naik); out._odata[sF] =scale*(naive+naik);
} }
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF, SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink) { int sU, const FermionField &in, SiteSpinor &out,int threeLink)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -301,7 +298,6 @@ void StaggeredKernels<Impl>::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrd
vstream(out()()(2),even_2+odd_2); vstream(out()()(2),even_2+odd_2);
} }
}
FermOpStaggeredTemplateInstantiate(StaggeredKernels); FermOpStaggeredTemplateInstantiate(StaggeredKernels);
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels); FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);

View File

@ -57,34 +57,33 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> seeds({1,2,3,4}); std::vector<int> seeds({1,2,3,4});
/*
GridParallelRNG pRNG4(UGrid); GridParallelRNG pRNG4(UGrid);
GridParallelRNG pRNG5(FGrid); GridParallelRNG pRNG5(FGrid);
pRNG4.SeedFixedIntegers(seeds); pRNG4.SeedFixedIntegers(seeds);
pRNG5.SeedFixedIntegers(seeds); pRNG5.SeedFixedIntegers(seeds);
*/
typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
typename ImprovedStaggeredFermion5DR::ImplParams params; typename ImprovedStaggeredFermion5DR::ImplParams params;
FermionField src (FGrid); src=zero; FermionField src (FGrid);
random(pRNG5,src);
// random(pRNG5,src);
/* /*
std::vector<int> site({0,0,0,0,0}); std::vector<int> site({0,1,2,0,0});
ColourVector cv = zero; ColourVector cv = zero;
cv()()(0)=1.0; cv()()(0)=1.0;
src = zero; src = zero;
pokeSite(cv,src,site); pokeSite(cv,src,site);
*/ */
FermionField result(FGrid); result=zero; FermionField result(FGrid); result=zero;
FermionField tmp(FGrid); tmp=zero; FermionField tmp(FGrid); tmp=zero;
FermionField err(FGrid); tmp=zero; FermionField err(FGrid); tmp=zero;
FermionField phi (FGrid); phi=1.0;//random(pRNG5,phi); FermionField phi (FGrid); random(pRNG5,phi);
FermionField chi (FGrid); chi=1.0;//random(pRNG5,chi); FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); Umu=1.0; //SU3::HotConfiguration(pRNG4,Umu); LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu);
/* /*
for(int mu=1;mu<4;mu++){ for(int mu=1;mu<4;mu++){
@ -127,7 +126,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl; std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm; QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;