From 24162c9eadb6316f2125c2430c51440969cd2310 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 9 Jan 2018 13:02:52 +0000 Subject: [PATCH] Staggered overlap comms comput --- .../fermion/ImprovedStaggeredFermion5D.cc | 164 ++++++ .../fermion/ImprovedStaggeredFermion5D.h | 19 + lib/qcd/action/fermion/StaggeredKernels.cc | 389 +++++++------- lib/qcd/action/fermion/StaggeredKernels.h | 79 ++- lib/qcd/action/fermion/StaggeredKernelsAsm.cc | 106 +++- .../action/fermion/StaggeredKernelsHand.cc | 497 ++++++++++-------- lib/qcd/action/fermion/WilsonCompressor.h | 59 +-- lib/qcd/action/fermion/WilsonFermion5D.cc | 3 +- lib/stencil/Stencil.h | 40 +- lib/util/Init.cc | 2 + 10 files changed, 869 insertions(+), 489 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 7d988d89..2df4c044 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -124,6 +124,15 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, // Allocate the required comms buffer ImportGauge(_Uthin,_Ufat); + + int LLs = FiveDimGrid._rdimensions[0]; + int vol4= FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + + vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + } template @@ -223,6 +232,157 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, assert(0); } +/*CHANGE */ +template +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + DhopTotalTime-=usecond(); +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopTotalTime+=usecond(); +} + +template +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + + Compressor compressor; + + int LLs = in._grid->_rdimensions[0]; + int len = U._grid->oSites(); + + DhopFaceTime-=usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms + DhopFaceTime+=usecond(); + + double ctime=0; + double ptime=0; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); // 4d vol + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<--------- + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------ + } + } + ptime = usecond() - start; + } else { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; + } + } + DhopCommTime += ctime; + DhopComputeTime+=ptime; + + // First to enter, last to leave timing + st.CollateThreads(); + + DhopFaceTime-=usecond(); + st.CommsMerge(compressor); + DhopFaceTime+=usecond(); + + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<---------- + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<---------- + } + } + DhopComputeTime2+=usecond(); +#else + assert(0); +#endif + +} + +template +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + Compressor compressor; + int LLs = in._grid->_rdimensions[0]; + + + + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); + st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); + } + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +} +/*CHANGE END*/ + +/* ORG template void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,DoubledGaugeField & UUU, @@ -254,6 +414,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime += usecond(); DhopTotalTime += usecond(); } +*/ template @@ -336,6 +497,9 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) DhopTotalTime = 0; DhopCommTime = 0; DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index ca1a955a..e21142b8 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -64,6 +64,8 @@ namespace QCD { double DhopCalls; double DhopCommTime; double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base @@ -119,6 +121,23 @@ namespace QCD { FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + void DhopInternalSerialComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index b6ec14c7..ea44cdd8 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -32,223 +32,242 @@ namespace Grid { namespace QCD { int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; +int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; + +#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); + +#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } + +#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + chi_p = &buf[SE->_offset]; \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } template StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; -//////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////// // Generic implementation; move to different file? -//////////////////////////////////////////// - +// Int, Ext, Int+Ext cases for comms overlap +//////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) { +void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, int dag) { const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; StencilEntry *SE; int ptype; - int skew = 0; - if (threeLink) skew=8; - /////////////////////////// - // Xp - /////////////////////////// + int skew; - SE = st.GetEntry(ptype, Xp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; + for(int s=0;s_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm); - - vstream(out, Uchi); }; + /////////////////////////////////////////////////// + // Only contributions from interior of our node + /////////////////////////////////////////////////// +template +void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int skew ; + + for(int s=0;s +void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int nmu=0; + int skew ; + + for(int s=0;s void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionField &in, FermionField &out) { - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, + int interior,int exterior) +{ int dag=1; - switch(Opt) { -#ifdef AVX512 - //FIXME; move the sign into the Asm routine - case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { assert(0); }; @@ -645,10 +681,9 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -685,7 +720,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -695,10 +734,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -734,7 +772,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -776,10 +818,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -832,7 +873,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -841,10 +886,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -897,7 +941,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -909,7 +957,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, DoubledGaugeField &U, \ DoubledGaugeField &UUU, \ SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out); + int sU, const FermionField &in, FermionField &out,int dag); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc index 7de8480c..47ebdd86 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc @@ -28,7 +28,6 @@ Author: paboyle /* END LEGAL */ #include -#define REGISTER #define LOAD_CHI(b) \ const SiteSpinor & ref (b[offset]); \ @@ -59,7 +58,7 @@ Author: paboyle UChi ## _1 += U_12*Chi_2;\ UChi ## _2 += U_22*Chi_2; -#define MULT_ADD(A,UChi) \ +#define MULT_ADD(U,A,UChi) \ auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ @@ -82,241 +81,319 @@ Author: paboyle #define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0);\ - permute##dir(Chi_1,Chi_1);\ - permute##dir(Chi_2,Chi_2); + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + + +#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else { \ + LOAD_CHI(buf); \ + } + +#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT(Dir,even); \ + } + +#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT_ADD(U,Dir,even); \ + } + + + +#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else if ( st.same_node[Dir] ) { \ + LOAD_CHI(buf); \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + MULT_ADD(U,Dir,even); \ + } + +#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + { LOAD_CHI(buf); } \ + { MULT_ADD(U,Dir,even); } \ + } namespace Grid { namespace QCD { template -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out, int dag) -{ - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; - int skew(0); - Real scale(1.0); - - if(dag) scale = -1.0; - - for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) +void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - REGISTER Simd even_0; // 12 regs on knc - REGISTER Simd even_1; - REGISTER Simd even_2; - REGISTER Simd odd_0; // 12 regs on knc - REGISTER Simd odd_1; - REGISTER Simd odd_2; + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; - REGISTER Simd Chi_0; // two spinor; 6 regs - REGISTER Simd Chi_1; - REGISTER Simd Chi_2; - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. - REGISTER Simd U_02; - REGISTER Simd U_12; - REGISTER Simd U_22; - - int skew = 0; - if (threeLink) skew=8; + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + SiteSpinor result; int offset,local,perm, ptype; + StencilEntry *SE; + int skew; - // Xp - SE=st.GetEntry(ptype,Xp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Xp,even); - } - - // Yp - SE=st.GetEntry(ptype,Yp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Yp,odd); - } + for(int s=0;s_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + skew = 0; + HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); + HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd); + HAND_STENCIL_LEG (U,Zp,1,skew,even); + HAND_STENCIL_LEG (U,Tp,0,skew,odd); + HAND_STENCIL_LEG (U,Xm,3,skew,even); + HAND_STENCIL_LEG (U,Ym,2,skew,odd); + HAND_STENCIL_LEG (U,Zm,1,skew,even); + HAND_STENCIL_LEG (U,Tm,0,skew,odd); + skew = 8; + HAND_STENCIL_LEG(UUU,Xp,3,skew,even); + HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zp,1,skew,even); + HAND_STENCIL_LEG(UUU,Tp,0,skew,odd); + HAND_STENCIL_LEG(UUU,Xm,3,skew,even); + HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zm,1,skew,even); + HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); + + if ( dag ) { + result()()(0) = - even_0 - odd_0; + result()()(1) = - even_1 - odd_1; + result()()(2) = - even_2 - odd_2; + } else { + result()()(0) = even_0 + odd_0; + result()()(1) = even_1 + odd_1; + result()()(2) = even_2 + odd_2; } - } else { - LOAD_CHI(buf); + vstream(out._odata[sF],result); } - { - MULT_ADD(Zp,even); - } - - // Tp - SE=st.GetEntry(ptype,Tp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tp,odd); - } - - // Xm - SE=st.GetEntry(ptype,Xm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Xm,even); - } - - - // Ym - SE=st.GetEntry(ptype,Ym+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Ym,odd); - } - - // Zm - SE=st.GetEntry(ptype,Zm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Zm,even); - } - - // Tm - SE=st.GetEntry(ptype,Tm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tm,odd); - } - - vstream(out()()(0),even_0+odd_0); - vstream(out()()(1),even_1+odd_1); - vstream(out()()(2),even_2+odd_2); - } + +template +void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s +void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeField &U,DoubledGaugeField &UUU, \ - SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out, int dag); + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ -#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \ - SiteSpinor *buf, int sF, \ - int sU, const FermionField &in, SiteSpinor &out,int threeLink) ; DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF); -}} +} +} + diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index cc5c3c63..3660cc0b 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -267,41 +267,16 @@ public: } typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector same_node; - std::vector surface_list; - WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) , - same_node(npoints) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) { ZeroCountersi(); - surface_list.resize(0); }; - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -362,23 +337,23 @@ public: int dag = compress.dag; int face_idx=0; if ( dag ) { - assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 1da58ddb..78124beb 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -444,8 +444,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } } ptime = usecond() - start; - } - { + } else { double start = usecond(); st.CommunicateThreaded(); ctime = usecond() - start; diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 887d8a7c..d1c4ae95 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -149,7 +149,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::vector _distances; std::vector _comm_buf_size; std::vector _permute_type; - + std::vector same_node; + std::vector surface_list; + Vector _entries; std::vector Packets; std::vector Mergers; @@ -200,7 +202,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int dimension = _directions[point]; int displacement = _distances[point]; - assert( (displacement==1) || (displacement==-1)); + int pd = _grid->_processors[dimension]; int fd = _grid->_fdimensions[dimension]; @@ -215,9 +217,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if ( ! comm_dim ) return 1; int nbr_proc; - if (displacement==1) nbr_proc = 1; - else nbr_proc = pd-1; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); @@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; + // Move interior/exterior split into the generic stencil + // FIXME Explicit Ls in interface is a pain. Should just use a vol + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + } + + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + CartesianStencil(GridBase *grid, int npoints, int checkerboard, @@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal comm_bytes_thr(npoints), comm_enter_thr(npoints), comm_leave_thr(npoints), - comm_time_thr(npoints) + comm_time_thr(npoints), + same_node(npoints) { face_table_computed=0; _npoints = npoints; @@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _directions = directions; _distances = distances; _unified_buffer_size=0; + surface_list.resize(0); int osites = _grid->oSites(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 031f8f5a..13b8f219 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -360,8 +360,10 @@ void Grid_init(int *argc,char ***argv) } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute; } else { QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);