From 24162c9eadb6316f2125c2430c51440969cd2310 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 9 Jan 2018 13:02:52 +0000 Subject: [PATCH 01/25] Staggered overlap comms comput --- .../fermion/ImprovedStaggeredFermion5D.cc | 164 ++++++ .../fermion/ImprovedStaggeredFermion5D.h | 19 + lib/qcd/action/fermion/StaggeredKernels.cc | 389 +++++++------- lib/qcd/action/fermion/StaggeredKernels.h | 79 ++- lib/qcd/action/fermion/StaggeredKernelsAsm.cc | 106 +++- .../action/fermion/StaggeredKernelsHand.cc | 497 ++++++++++-------- lib/qcd/action/fermion/WilsonCompressor.h | 59 +-- lib/qcd/action/fermion/WilsonFermion5D.cc | 3 +- lib/stencil/Stencil.h | 40 +- lib/util/Init.cc | 2 + 10 files changed, 869 insertions(+), 489 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 7d988d89..2df4c044 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -124,6 +124,15 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, // Allocate the required comms buffer ImportGauge(_Uthin,_Ufat); + + int LLs = FiveDimGrid._rdimensions[0]; + int vol4= FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + + vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + } template @@ -223,6 +232,157 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, assert(0); } +/*CHANGE */ +template +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + DhopTotalTime-=usecond(); +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopTotalTime+=usecond(); +} + +template +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + + Compressor compressor; + + int LLs = in._grid->_rdimensions[0]; + int len = U._grid->oSites(); + + DhopFaceTime-=usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms + DhopFaceTime+=usecond(); + + double ctime=0; + double ptime=0; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. 
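+  // The first nCommThreads OMP threads progress the halo communications;
+  // the remaining threads divide the 4d volume between them and run the
+  // interior-only kernels (Interior=1, Exterior=0) while the comms are in
+  // flight.  The surface sites are finished below, after CommsMerge, with
+  // Interior=0, Exterior=1.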
+ ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); // 4d vol + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<--------- + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------ + } + } + ptime = usecond() - start; + } else { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; + } + } + DhopCommTime += ctime; + DhopComputeTime+=ptime; + + // First to enter, last to leave timing + st.CollateThreads(); + + DhopFaceTime-=usecond(); + st.CommsMerge(compressor); + DhopFaceTime+=usecond(); + + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<---------- + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<---------- + } + } + DhopComputeTime2+=usecond(); +#else + assert(0); +#endif + +} + +template +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + Compressor compressor; + int LLs = in._grid->_rdimensions[0]; + + + + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); + st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); + } + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +} +/*CHANGE END*/ + +/* ORG template void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,DoubledGaugeField & UUU, @@ -254,6 +414,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime += usecond(); DhopTotalTime += usecond(); } +*/ template @@ -336,6 +497,9 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) DhopTotalTime = 0; DhopCommTime = 0; DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h 
b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index ca1a955a..e21142b8 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -64,6 +64,8 @@ namespace QCD { double DhopCalls; double DhopCommTime; double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base @@ -119,6 +121,23 @@ namespace QCD { FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + void DhopInternalSerialComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index b6ec14c7..ea44cdd8 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -32,223 +32,242 @@ namespace Grid { namespace QCD { int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; +int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; + +#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); + +#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } + +#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + chi_p = &buf[SE->_offset]; \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } template StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; -//////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////// // Generic implementation; move to different file? 
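+// DhopSiteGeneric    : all sixteen legs (one-link and Naik) for every site
+// DhopSiteGenericInt : only legs whose neighbour is local or on the same node (SHM)
+// DhopSiteGenericExt : only legs whose neighbour arrived through MPI comms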
-//////////////////////////////////////////// - +// Int, Ext, Int+Ext cases for comms overlap +//////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) { +void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, int dag) { const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; StencilEntry *SE; int ptype; - int skew = 0; - if (threeLink) skew=8; - /////////////////////////// - // Xp - /////////////////////////// + int skew; - SE = st.GetEntry(ptype, Xp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; + for(int s=0;s_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm); - - vstream(out, Uchi); }; + /////////////////////////////////////////////////// + // Only 
contributions from interior of our node + /////////////////////////////////////////////////// +template +void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int skew ; + + for(int s=0;s +void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int nmu=0; + int skew ; + + for(int s=0;s void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionField &in, FermionField &out) { - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, + int interior,int exterior) +{ int dag=1; - switch(Opt) { -#ifdef AVX512 - //FIXME; move the sign into the Asm routine - case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { assert(0); }; @@ -645,10 +681,9 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -685,7 +720,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -695,10 +734,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -734,7 +772,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -776,10 +818,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int 
LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -832,7 +873,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -841,10 +886,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -897,7 +941,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -909,7 +957,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, DoubledGaugeField &U, \ DoubledGaugeField &UUU, \ SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out); + int sU, const FermionField &in, FermionField &out,int dag); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc index 7de8480c..47ebdd86 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc @@ -28,7 +28,6 @@ Author: paboyle /* END LEGAL */ #include -#define REGISTER #define LOAD_CHI(b) \ const SiteSpinor & ref (b[offset]); \ @@ -59,7 +58,7 @@ Author: paboyle UChi ## _1 += U_12*Chi_2;\ UChi ## _2 += U_22*Chi_2; -#define MULT_ADD(A,UChi) \ +#define MULT_ADD(U,A,UChi) \ auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ @@ -82,241 +81,319 @@ Author: paboyle #define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0);\ - permute##dir(Chi_1,Chi_1);\ - permute##dir(Chi_2,Chi_2); + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + + +#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else { \ + LOAD_CHI(buf); \ + } + +#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT(Dir,even); \ + } + +#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT_ADD(U,Dir,even); \ + } + + + +#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else if ( st.same_node[Dir] ) { \ + LOAD_CHI(buf); \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + MULT_ADD(U,Dir,even); \ + } + +#define 
HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + { LOAD_CHI(buf); } \ + { MULT_ADD(U,Dir,even); } \ + } namespace Grid { namespace QCD { template -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out, int dag) -{ - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; - int skew(0); - Real scale(1.0); - - if(dag) scale = -1.0; - - for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) +void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - REGISTER Simd even_0; // 12 regs on knc - REGISTER Simd even_1; - REGISTER Simd even_2; - REGISTER Simd odd_0; // 12 regs on knc - REGISTER Simd odd_1; - REGISTER Simd odd_2; + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; - REGISTER Simd Chi_0; // two spinor; 6 regs - REGISTER Simd Chi_1; - REGISTER Simd Chi_2; - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. - REGISTER Simd U_02; - REGISTER Simd U_12; - REGISTER Simd U_22; - - int skew = 0; - if (threeLink) skew=8; + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + SiteSpinor result; int offset,local,perm, ptype; + StencilEntry *SE; + int skew; - // Xp - SE=st.GetEntry(ptype,Xp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Xp,even); - } - - // Yp - SE=st.GetEntry(ptype,Yp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Yp,odd); - } + for(int s=0;s_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
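+      // skew=0 walks the eight one-link (fat link U) legs, skew=8 the eight
+      // three-link Naik (UUU) legs.  Alternate directions accumulate into the
+      // even_* / odd_* register sets, which are summed (and negated for the
+      // dagger) into the result.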
+ skew = 0; + HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); + HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd); + HAND_STENCIL_LEG (U,Zp,1,skew,even); + HAND_STENCIL_LEG (U,Tp,0,skew,odd); + HAND_STENCIL_LEG (U,Xm,3,skew,even); + HAND_STENCIL_LEG (U,Ym,2,skew,odd); + HAND_STENCIL_LEG (U,Zm,1,skew,even); + HAND_STENCIL_LEG (U,Tm,0,skew,odd); + skew = 8; + HAND_STENCIL_LEG(UUU,Xp,3,skew,even); + HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zp,1,skew,even); + HAND_STENCIL_LEG(UUU,Tp,0,skew,odd); + HAND_STENCIL_LEG(UUU,Xm,3,skew,even); + HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zm,1,skew,even); + HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); + + if ( dag ) { + result()()(0) = - even_0 - odd_0; + result()()(1) = - even_1 - odd_1; + result()()(2) = - even_2 - odd_2; + } else { + result()()(0) = even_0 + odd_0; + result()()(1) = even_1 + odd_1; + result()()(2) = even_2 + odd_2; } - } else { - LOAD_CHI(buf); + vstream(out._odata[sF],result); } - { - MULT_ADD(Zp,even); - } - - // Tp - SE=st.GetEntry(ptype,Tp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tp,odd); - } - - // Xm - SE=st.GetEntry(ptype,Xm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Xm,even); - } - - - // Ym - SE=st.GetEntry(ptype,Ym+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Ym,odd); - } - - // Zm - SE=st.GetEntry(ptype,Zm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Zm,even); - } - - // Tm - SE=st.GetEntry(ptype,Tm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tm,odd); - } - - vstream(out()()(0),even_0+odd_0); - vstream(out()()(1),even_1+odd_1); - vstream(out()()(2),even_2+odd_2); - } + +template +void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. 
+ Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s +void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeField &U,DoubledGaugeField &UUU, \ - SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out, int dag); + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ -#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \ - SiteSpinor *buf, int sF, \ - int sU, const FermionField &in, SiteSpinor &out,int threeLink) ; DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF); -}} +} +} + diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index cc5c3c63..3660cc0b 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -267,41 +267,16 @@ public: } typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector same_node; - std::vector surface_list; - WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) , - same_node(npoints) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) { ZeroCountersi(); - surface_list.resize(0); }; - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } template < class compressor> void 
HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -362,23 +337,23 @@ public: int dag = compress.dag; int face_idx=0; if ( dag ) { - assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 1da58ddb..78124beb 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -444,8 +444,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } } ptime = usecond() - start; - } - { + } else { double start = usecond(); st.CommunicateThreaded(); ctime = usecond() - start; diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 887d8a7c..d1c4ae95 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -149,7 +149,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::vector _distances; std::vector _comm_buf_size; 
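+  // Added below: same_node[point] records whether the neighbour in that
+  // stencil direction lives on this node (reachable via shared memory);
+  // surface_list collects the sites with at least one off-node neighbour,
+  // so the exterior pass only visits the surface.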
std::vector _permute_type; - + std::vector same_node; + std::vector surface_list; + Vector _entries; std::vector Packets; std::vector Mergers; @@ -200,7 +202,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int dimension = _directions[point]; int displacement = _distances[point]; - assert( (displacement==1) || (displacement==-1)); + int pd = _grid->_processors[dimension]; int fd = _grid->_fdimensions[dimension]; @@ -215,9 +217,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if ( ! comm_dim ) return 1; int nbr_proc; - if (displacement==1) nbr_proc = 1; - else nbr_proc = pd-1; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); @@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; + // Move interior/exterior split into the generic stencil + // FIXME Explicit Ls in interface is a pain. Should just use a vol + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + } + + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + CartesianStencil(GridBase *grid, int npoints, int checkerboard, @@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal comm_bytes_thr(npoints), comm_enter_thr(npoints), comm_leave_thr(npoints), - comm_time_thr(npoints) + comm_time_thr(npoints), + same_node(npoints) { face_table_computed=0; _npoints = npoints; @@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _directions = directions; _distances = distances; _unified_buffer_size=0; + surface_list.resize(0); int osites = _grid->oSites(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 031f8f5a..13b8f219 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -360,8 +360,10 @@ void Grid_init(int *argc,char ***argv) } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute; } else { QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent); From 63982819c685b852e56fca552896afb78ebe4598 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 22 Jan 2018 11:03:39 +0000 Subject: [PATCH 02/25] No option to overlap comms and compute for asm implementation since different directions are interleaved in the kernels, introducing if else structure would be too painful --- lib/qcd/action/fermion/StaggeredKernels.h | 8 -------- 1 file changed, 8 deletions(-) diff --git 
a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index c0c23caa..79de1a68 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -93,14 +93,6 @@ public: DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, int LLs, int sU, const FermionField &in, FermionField &out,int dag); - void DhopSiteAsmInt(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionField &in, FermionField &out,int dag); - void DhopSiteAsmExt(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionField &in, FermionField &out,int dag); /////////////////////////////////////////////////////////////////////////////////////////////////// // Generic interface; fan out to right routine /////////////////////////////////////////////////////////////////////////////////////////////////// From 97b9c6f03d1a813674cc2e1c52de50ebfac32fdf Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 22 Jan 2018 11:04:19 +0000 Subject: [PATCH 03/25] No option for interior/exterior split of asm kernels since different directions get interleaved --- lib/qcd/action/fermion/StaggeredKernels.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index ea44cdd8..b7e568c2 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -245,10 +245,9 @@ void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double case OptInlineAsm: if ( interior && exterior ) { DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else if ( interior ) { - DhopSiteAsmInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else if ( exterior ) { - DhopSiteAsmExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); + } else { + std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"< Date: Thu, 22 Feb 2018 12:50:09 +0000 Subject: [PATCH 04/25] OverlappedComm for Staggered 5D and 4D. 
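Adds the overlapped comms/compute path to the 4d ImprovedStaggeredFermion
(mirroring the 5d case) and reworks the staggered constructors: the
improvement coefficients c1, c2 and u0 must now be passed explicitly, and a
MILC-style constructor taking pre-multiplied triple-link and fat-link fields
is added together with ImportGaugeSimple/InsertGaugeField.  Call sites follow
the updated solver tests, e.g.

    RealD mass = 0.003;
    RealD c1   =  9.0/8.0;   // one-link coefficient
    RealD c2   = -1.0/24.0;  // Naik (three-link) coefficient
    RealD u0   =  1.0;       // tadpole factor
    ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);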
--- lib/algorithms/iterative/ConjugateGradient.h | 8 +- lib/qcd/action/fermion/FermionOperatorImpl.h | 73 +++++----- .../fermion/ImprovedStaggeredFermion.cc | 126 ++++++++++++++++-- .../action/fermion/ImprovedStaggeredFermion.h | 16 ++- .../fermion/ImprovedStaggeredFermion5D.cc | 67 +++++++++- .../fermion/ImprovedStaggeredFermion5D.h | 23 +++- tests/solver/Test_staggered_block_cg_prec.cc | 5 +- .../solver/Test_staggered_block_cg_unprec.cc | 7 +- tests/solver/Test_staggered_cg_prec.cc | 5 +- tests/solver/Test_staggered_cg_schur.cc | 5 +- tests/solver/Test_staggered_cg_unprec.cc | 5 +- 11 files changed, 269 insertions(+), 71 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index 0d4e51c7..e1c796cd 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction { void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { + psi.checkerboard = src.checkerboard; conformable(psi, src); @@ -101,7 +102,7 @@ class ConjugateGradient : public OperatorFunction { SolverTimer.Start(); int k; - for (k = 1; k <= MaxIterations; k++) { + for (k = 1; k <= MaxIterations*1000; k++) { c = cp; MatrixTimer.Start(); @@ -109,8 +110,9 @@ class ConjugateGradient : public OperatorFunction { MatrixTimer.Stop(); LinalgTimer.Start(); - // RealD qqck = norm2(mmp); - // ComplexD dck = innerProduct(p,mmp); + // AA + // RealD qqck = norm2(mmp); + // ComplexD dck = innerProduct(p,mmp); a = c / d; b_pred = a * (a * qq - d) / c; diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 9d24deb2..42f222ac 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -689,7 +689,12 @@ class StaggeredImpl : public PeriodicGaugeImpl(U_ds, U, mu); + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -728,8 +733,10 @@ class StaggeredImpl : public PeriodicGaugeImpl(Uds, U, mu); - PokeIndex(Uds, Udag, mu + 4); + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); + // PokeIndex(Uds, U, mu); + // PokeIndex(Uds, Udag, mu + 4); // 3 hop based on thin links. Crazy huh ? 
U = PeekIndex(Uthin, mu); @@ -741,8 +748,8 @@ class StaggeredImpl : public PeriodicGaugeImpl(UUUds, UUU, mu); - PokeIndex(UUUds, UUUdag, mu+4); + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } @@ -834,6 +841,23 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { + + SiteScalarGaugeLink ScalarU; + SiteDoubledGaugeField ScalarUds; + + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); + peekLocalSite(ScalarUds, U_ds, lcoor); + + peekLocalSite(ScalarU, U, lcoor); + ScalarUds(mu) = ScalarU(); + + } + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -875,23 +899,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, Uds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, Udag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); // 3 hop based on thin links. Crazy huh ? U = PeekIndex(Uthin, mu); @@ -903,24 +912,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUds, UUUds, lcoor); - - peekLocalSite(ScalarU, UUU, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, UUUdag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, UUUds, lcoor); - } + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 5ce2b335..811e482d 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, template ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) : Kernels(p), _grid(&Fgrid), @@ -62,6 +63,16 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G UUUmuOdd(&Hgrid) , _tmp(&Hgrid) { + int vol4; + int LLs=1; + c1=_c1; + c2=_c2; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); } template @@ -69,18 +80,15 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau GridRedBlackCartesian &Hgrid, RealD _mass, RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) + : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p) { - c1=_c1; - c2=_c2; - u0=_u0; ImportGauge(_Uthin,_Ufat); } template -ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, +ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) + : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,1.0,1.0,1.0,p) { 
ImportGaugeSimple(_Utriple,_Ufat); } @@ -374,20 +382,116 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); +} +template +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } + } else { + st.CommunicateThreaded(); + } + } + + // First to enter, last to leave timing + st.CommsMerge(compressor); + + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } +#else + assert(0); +#endif +} + + +template +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor; st.HaloExchange(in, compressor); if (dag == DaggerYes) { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } else { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 7d1f2996..69d0aef4 
100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -105,21 +105,31 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); // Constructor ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); - ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, + ////////////////////////////////////////////////////////////////////////// + // MILC constructor no coefficients; premultiply links by desired scaling + ////////////////////////////////////////////////////////////////////////// + ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p = ImplParams()); + ////////////////////////////////////////////////////////////////////////// + // A don't initialise the gauge field constructor; largely internal + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); - // DoubleStore impl dependent void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 2df4c044..e5146d7a 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, // 5d lattice for DWF. 
template -ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, - GridCartesian &FiveDimGrid, +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid, GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, @@ -121,18 +120,43 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, assert(FiveDimGrid._simd_layout[0] ==1); } - - // Allocate the required comms buffer - ImportGauge(_Uthin,_Ufat); - int LLs = FiveDimGrid._rdimensions[0]; int vol4= FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); vol4=FourDimRedBlackGrid.oSites(); StencilEven.BuildSurfaceList(LLs,vol4); - StencilOdd.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} +template +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass, + RealD _c1,RealD _c2, RealD _u0, + const ImplParams &p) : + ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,_c1,_c2,_u0,p) +{ + ImportGauge(_Uthin,_Ufat); +} +template +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Utriple,GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass, + const ImplParams &p) : + ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,1.0,1.0,1.0,p) +{ + ImportGaugeSimple(_Utriple,_Ufat); } template @@ -140,6 +164,35 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) { ImportGauge(_Uthin,_Uthin); }; +/////////////////////////////////////////////////// +// For MILC use; pass three link U's and 1 link U +/////////////////////////////////////////////////// +template +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) +{ + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + auto U = PeekIndex(_Utriple, mu); + Impl::InsertGaugeField(UUUmu,U,mu); + + U = adj( Cshift(U, mu, -3)); + Impl::InsertGaugeField(UUUmu,-U,mu+4); + + U = PeekIndex(_Ufat, mu); + Impl::InsertGaugeField(Umu,U,mu); + + U = adj( Cshift(U, mu, -1)); + Impl::InsertGaugeField(Umu,-U,mu+4); + + } + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); + pickCheckerboard(Even, UUUmuEven,UUUmu); + pickCheckerboard(Odd, UUUmuOdd, UUUmu); +} template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) { diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index e21142b8..f2fce1c1 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -139,6 +139,15 @@ namespace QCD { // Constructors + // -- No Gauge field + ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _mass, + RealD _c1, RealD _c2,RealD _u0, + const 
ImplParams &p= ImplParams()); + // -- Thin link and fat link, with coefficients ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -146,12 +155,24 @@ namespace QCD { GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, + RealD _c1, RealD _c2,RealD _u0, + const ImplParams &p= ImplParams()); + //////////////////////////////////////////////////////////////////////////////////////////////// + // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied + //////////////////////////////////////////////////////////////////////////////////////////////// + ImprovedStaggeredFermion5D(GaugeField &_Utriple, + GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _mass, const ImplParams &p= ImplParams()); // DoubleStore void ImportGauge(const GaugeField &_U); void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_Uthin,const GaugeField &_Ufat); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 0076e5a0..98a5074e 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -75,7 +75,10 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); SchurStaggeredOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 22051ef6..32a1448c 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -74,7 +74,10 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -86,7 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=zero; diff --git a/tests/solver/Test_staggered_cg_prec.cc b/tests/solver/Test_staggered_cg_prec.cc index 6bea97c2..167958ed 100644 --- a/tests/solver/Test_staggered_cg_prec.cc +++ b/tests/solver/Test_staggered_cg_prec.cc @@ -71,7 +71,10 @@ int main (int argc, char ** argv) } RealD mass=0.003; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); FermionField res_o(&RBGrid); FermionField src_o(&RBGrid); diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc index 
a5c25b85..03a173f9 100644 --- a/tests/solver/Test_staggered_cg_schur.cc +++ b/tests/solver/Test_staggered_cg_schur.cc @@ -65,7 +65,10 @@ int main (int argc, char ** argv) FermionField resid(&Grid); RealD mass=0.1; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackStaggeredSolve SchurSolver(CG); diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc index eb33c004..3fae925a 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -73,7 +73,10 @@ int main (int argc, char ** argv) } RealD mass=0.1; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-6,10000); From b9382020813d3872d4397da89dec16f68a950be6 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 7 Mar 2018 14:08:43 +0000 Subject: [PATCH 05/25] Overlapped Comm for Wilson DhopInternal --- lib/qcd/action/fermion/WilsonFermion.cc | 90 ++++++++++++++++++++++++- lib/qcd/action/fermion/WilsonFermion.h | 6 ++ lib/qcd/action/fermion/WilsonKernels.h | 41 +++++------ 3 files changed, 114 insertions(+), 23 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 19f9674d..a7c69ac9 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -322,15 +322,98 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } -}; - +} +/*Change starts*/ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { - assert((dag == DaggerNo) || (dag == DaggerYes)); +#ifdef GRID_OMP + if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else +#endif + DhopInternalSerial(st,lo,U,in,out,dag); +} + +template +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { + assert((dag == DaggerNo) || (dag == DaggerYes)); +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + // do the compute + if (dag == DaggerYes) { + + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, 
sss, 1, 1, in, out); + } + } //else + + } else { + st.CommunicateThreaded(); + } + + Compressor compressor(dag); + + if (dag == DaggerYes) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } + + } //pragma +#else + assert(0); +#endif +}; + + +template +void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { + assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); st.HaloExchange(in, compressor); @@ -344,6 +427,7 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, } } }; +/*Change ends */ FermOpTemplateInstantiate(WilsonFermion); AdjointFermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 933be732..55433854 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -115,6 +115,12 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 2cf52660..e84b52f9 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -53,7 +53,7 @@ template class WilsonKernels : public FermionOperator , public typedef FermionOperator Base; public: - + template typename std::enable_if::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, @@ -70,27 +70,27 @@ public: break; #endif case OptHandUnroll: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } break; case OptGeneric: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - else assert(0); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) 
WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); + sF++; + } + sU++; + } break; default: assert(0); @@ -200,6 +200,7 @@ private: void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out); + void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); From 80302e95a84ffcbe9e0975241af575febc07c804 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 8 Mar 2018 15:34:03 +0000 Subject: [PATCH 06/25] MILC Interface --- .../fermion/ImprovedStaggeredFermion.cc | 33 ++++++------- .../action/fermion/ImprovedStaggeredFermion.h | 25 +++++----- .../fermion/ImprovedStaggeredFermion5D.cc | 48 ++++++++----------- .../fermion/ImprovedStaggeredFermion5D.h | 33 +++++++------ tests/core/Test_staggered5Dvec.cc | 5 +- 5 files changed, 67 insertions(+), 77 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 811e482d..69a3fdc1 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -84,15 +84,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau { ImportGauge(_Uthin,_Ufat); } -template -ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,1.0,1.0,1.0,p) -{ - ImportGaugeSimple(_Utriple,_Ufat); -} - //////////////////////////////////////////////////////////// // Momentum space propagator should be @@ -106,11 +97,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, G // of above link to implmement fourier based solver. 
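The overlapped-comms Dhop variants above split the OpenMP team into a few communication threads and a remainder of compute threads, giving each compute thread a contiguous, near-equal block of interior sites. A minimal standalone sketch of that partitioning follows; it is not Grid code, and the site count and comms-thread count are illustrative stand-ins for oSites() and CartesianCommunicator::nCommThreads.

#include <omp.h>
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int nsites = 1000;   // stand-in for U._grid->oSites()
  const int ncomms = 1;      // stand-in for CartesianCommunicator::nCommThreads
  std::vector<double> work(nsites, 0.0);

#pragma omp parallel
  {
    int tid      = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    assert(nthreads > ncomms);          // as in the patch: some threads must be left for compute
    if (tid < ncomms) {
      // communication threads: st.CommunicateThreaded() would run here
    } else {
      int nwork   = nthreads - ncomms;  // compute threads share the interior volume
      int ttid    = tid - ncomms;
      int chunk   = nsites / nwork;
      int rem     = nsites % nwork;
      int myblock = (ttid < rem) ? ttid * (chunk + 1) : ttid * chunk + rem;
      int myn     = (ttid < rem) ? chunk + 1          : chunk;
      for (int ss = myblock; ss < myblock + myn; ss++)
        work[ss] += 1.0;                // stand-in for the interior Dhop kernel
    }
  }

  double sum = 0; for (double w : work) sum += w;
  printf("compute threads covered %.0f of %d sites\n", sum, nsites);  // expect every site exactly once
  return 0;
}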
//////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin) -{ - ImportGauge(_Uthin,_Uthin); -}; -template void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) { ///////////////////////////////////////////////////////////////// @@ -133,6 +119,20 @@ void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utripl PokeIndex(Umu, -U, mu+4); } + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); +} + +template +void ImprovedStaggeredFermion::CopyGaugeCheckerboards(void) +{ pickCheckerboard(Even, UmuEven, Umu); pickCheckerboard(Odd, UmuOdd , Umu); pickCheckerboard(Even, UUUmuEven,UUUmu); @@ -168,10 +168,7 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } ///////////////////////////// diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 69d0aef4..c2c82073 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -110,30 +110,29 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - // Constructor + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); ////////////////////////////////////////////////////////////////////////// - // MILC constructor no coefficients; premultiply links by desired scaling - ////////////////////////////////////////////////////////////////////////// - ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p = ImplParams()); - - ////////////////////////////////////////////////////////////////////////// - // A don't initialise the gauge field constructor; largely internal + // MILC constructor no gauge fields ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - RealD _c1, RealD _c2,RealD _u0, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, const ImplParams &p = ImplParams()); // DoubleStore impl dependent - void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin); + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const 
DoubledGaugeField &_U); + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index e5146d7a..df9f055e 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -128,7 +128,14 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); } - +template +void ImprovedStaggeredFermion5D::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); + pickCheckerboard(Even, UUUmuEven,UUUmu); + pickCheckerboard(Odd, UUUmuOdd, UUUmu); +} template ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -144,26 +151,7 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, { ImportGauge(_Uthin,_Ufat); } -template -ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Utriple,GaugeField &_Ufat, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - RealD _mass, - const ImplParams &p) : - ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, - FourDimGrid,FourDimRedBlackGrid, - _mass,1.0,1.0,1.0,p) -{ - ImportGaugeSimple(_Utriple,_Ufat); -} -template -void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) -{ - ImportGauge(_Uthin,_Uthin); -}; /////////////////////////////////////////////////// // For MILC use; pass three link U's and 1 link U /////////////////////////////////////////////////// @@ -188,10 +176,17 @@ void ImprovedStaggeredFermion5D::ImportGaugeSimple(const GaugeField &_Utri Impl::InsertGaugeField(Umu,-U,mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven,UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); } template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) @@ -221,10 +216,7 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,cons PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } template void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f2fce1c1..d9174ec8 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -139,15 +139,9 @@ namespace QCD { // Constructors - // 
-- No Gauge field - ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _mass, - RealD _c1, RealD _c2,RealD _u0, - const ImplParams &p= ImplParams()); - // -- Thin link and fat link, with coefficients + //////////////////////////////////////////////////////////////////////////////////////////////// + // Grid internal interface -- Thin link and fat link, with coefficients + //////////////////////////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -160,19 +154,24 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////////////////// // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied //////////////////////////////////////////////////////////////////////////////////////////////// - ImprovedStaggeredFermion5D(GaugeField &_Utriple, - GaugeField &_Ufat, - GridCartesian &FiveDimGrid, + ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid, GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _mass, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, const ImplParams &p= ImplParams()); - - // DoubleStore - void ImportGauge(const GaugeField &_U); - void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); - void ImportGaugeSimple(const GaugeField &_Uthin,const GaugeField &_Ufat); + + // DoubleStore gauge field in operator + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); + // Give a reference; can be used to do an assignment or copy back out after import + // if Carleton wants to cache them and not use the ImportSimple + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index db76f792..2720c24c 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -141,6 +141,7 @@ int main (int argc, char ** argv) t1=usecond(); std::cout< Date: Thu, 26 Apr 2018 10:03:57 +0100 Subject: [PATCH 08/25] Faster reductions, important on single node staggered --- lib/lattice/Lattice_arith.h | 12 +----- lib/lattice/Lattice_reduction.h | 72 ++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/lib/lattice/Lattice_arith.h b/lib/lattice/Lattice_arith.h index c3093167..d1cbc84a 100644 --- a/lib/lattice/Lattice_arith.h +++ b/lib/lattice/Lattice_arith.h @@ -244,19 +244,11 @@ namespace Grid { template strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - axpy(ret,a,x,y); - return norm2(ret); + return axpy_norm_fast(ret,a,x,y); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - 
axpby(ret,a,b,x,y); - return norm2(ret); // FIXME implement parallel norm in ss loop + return axpby_norm_fast(ret,a,b,x,y); } } diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 8a3fbece..7e169baf 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -33,7 +33,7 @@ namespace Grid { // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); + auto nrm = innerProduct(arg,arg); return std::real(nrm); } @@ -43,12 +43,12 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; - scalar_type nrm; - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - + const int pad = 8; + + scalar_type nrm; + std::vector > sumarray(grid->SumArraySize()*pad); + parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); @@ -57,17 +57,69 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; + nrm = nrm+sumarray[i*pad]; } - nrm = Reduce(vvnrm);// sum across simd right._grid->GlobalSum(nrm); return nrm; } + +///////////////////////// +// Fast axpby_norm +// z = a x + b y +// return norm z +///////////////////////// +template strong_inline RealD +axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) +{ + sobj one(1.0); + return axpby_norm_fast(z,a,one,x,y); +} + +template strong_inline RealD +axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) +{ + const int pad = 8; + z.checkerboard = x.checkerboard; + conformable(z,x); + conformable(x,y); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + RealD nrm; + + GridBase *grid = x._grid; + + Vector sumarray(grid->SumArraySize()*pad); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); + + // private to thread; sub summation + decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; + for(int ss=myoff;ssSumArraySize();i++){ + nrm = nrm+sumarray[i*pad]; + } + z._grid->GlobalSum(nrm); + return nrm; +} + template inline auto sum(const LatticeUnaryExpression & expr) From 3e125c5b6172c83687ac050d7ba5373b4aa5e9ef Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:07:19 +0100 Subject: [PATCH 09/25] Faster linalg on CG optimised against staggered Sum overhead is bigger for staggered --- lib/algorithms/iterative/ConjugateGradient.h | 31 ++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index e1c796cd..ff4ba8ac 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -70,7 +70,6 @@ class ConjugateGradient : public OperatorFunction { Linop.HermOpAndNorm(psi, mmp, d, b); - r = src - mmp; p = r; @@ -97,6 +96,9 @@ class ConjugateGradient : public OperatorFunction { << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; + GridStopWatch InnerTimer; + GridStopWatch AxpyNormTimer; + GridStopWatch LinearCombTimer; GridStopWatch 
MatrixTimer; GridStopWatch SolverTimer; @@ -106,30 +108,32 @@ class ConjugateGradient : public OperatorFunction { c = cp; MatrixTimer.Start(); - Linop.HermOpAndNorm(p, mmp, d, qq); + Linop.HermOp(p, mmp); MatrixTimer.Stop(); LinalgTimer.Start(); - // AA - // RealD qqck = norm2(mmp); - // ComplexD dck = innerProduct(p,mmp); + InnerTimer.Start(); + ComplexD dc = innerProduct(p,mmp); + InnerTimer.Stop(); + d = dc.real(); a = c / d; - b_pred = a * (a * qq - d) / c; + AxpyNormTimer.Start(); cp = axpy_norm(r, -a, mmp, r); + AxpyNormTimer.Stop(); b = cp / c; - // Fuse these loops ; should be really easy - psi = a * p + psi; - p = p * b + r; - + LinearCombTimer.Start(); + parallel_for(int ss=0;ssoSites();ss++){ + vstream(psi[ss], a * p[ss] + psi[ss]); + vstream(p [ss], b * p[ss] + r[ss]); + } + LinearCombTimer.Stop(); LinalgTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k << " residual " << cp << " target " << rsq << std::endl; - std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl; - std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl; // Stopping condition if (cp <= rsq) { @@ -150,6 +154,9 @@ class ConjugateGradient : public OperatorFunction { std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < Date: Thu, 26 Apr 2018 10:08:05 +0100 Subject: [PATCH 10/25] Improvements to staggered tests timings --- tests/solver/Test_staggered_block_cg_prec.cc | 58 +++++++++++++++++-- .../solver/Test_staggered_block_cg_unprec.cc | 47 ++++++++++++++- tests/solver/Test_staggered_cg_prec.cc | 12 ++++ 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 98a5074e..82052684 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -74,6 +74,11 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + double volume=1; + for(int mu=0;mu HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d); FermionField result4d_o(UrbGrid); + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 result4d_o=zero; - CG(HermOp4d,src4d_o,result4d_o); + { + double t1=usecond(); + CG(HermOp4d,src4d_o,result4d_o); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOp(Ds); @@ -93,7 +99,19 @@ int main (int argc, char ** argv) MdagMLinearOperator HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=zero; - CG(HermOp4d,src4d,result4d); + + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 + { + double t1=usecond(); + CG(HermOp4d,src4d,result4d); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOpEO(Ds); ConjugateGradient CG(1.0e-8,10000); + double t1=usecond(); CG(HermOpEO,src_o,res_o); + double t2=usecond(); + + // Schur solver: uses DeoDoe => volume * 1146 + double ncall=CG.IterationsToComplete; + double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 + + std::cout<GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + 
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" < +void ImprovedStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion); //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 69d0aef4..750d29c6 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// @@ -142,7 +154,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS // protected: public: // any other parameters of action ??? 
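The counters added above turn per-site flop estimates into rates: the Deo/Doe estimate used in the tests is (16*(3*(6+8+8)) + 15*3*2) = 1146 flops per site, the Report() figure is 1154, and the result is divided by two for red-black counting (each checkerboarded call works on half the sites). A minimal sketch of that arithmetic, with a made-up lattice size, call count and timing, is:

#include <cstdio>

int main() {
  const double flopsPerSite  = 1146.0;              // improved staggered Deo/Doe estimate
  const double volume        = 16.0*16.0*16.0*32.0; // illustrative 4d volume
  const double dhopCalls     = 2000.0;              // illustrative number of half-checkerboard calls
  const double computeTimeUs = 5.0e6;               // illustrative summed kernel time, usecond() units

  // time is accumulated in microseconds, so flops per microsecond is Mflop/s;
  // divide by 2 for red-black counting, as in the Report() above
  double mflops = flopsPerSite * volume * dhopCalls / computeTimeUs / 2.0;
  printf("Average mflops/s per call : %f\n", mflops);
  return 0;
}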
- + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } RealD mass; RealD u0; RealD c1; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index e5146d7a..ab9c9c48 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -291,14 +291,12 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { - DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else #endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); - DhopTotalTime+=usecond(); } template @@ -412,6 +410,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, + //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); st.HaloExchange(in,compressor); @@ -432,6 +431,12 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, } DhopComputeTime += usecond(); DhopTotalTime += usecond(); + //double t2=usecond(); + //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; + } /*CHANGE END*/ diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f2fce1c1..4024b472 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -178,6 +178,9 @@ namespace QCD { // Data members require to support the functionality /////////////////////////////////////////////////////////////// public: + + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } GridBase *_FourDimGrid; GridBase *_FourDimRedBlackGrid; diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 55433854..d0181d68 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -135,6 +135,8 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { // protected: public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; RealD mass; GridBase *_grid; From 1c64ee926edf9472c6a8bcbd961463cab8c0628c Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:17:49 +0100 Subject: [PATCH 13/25] Faster staggered operator with m^2 term trivial used --- lib/algorithms/LinearOperator.h | 65 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 26746e6e..c9410410 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -51,7 +51,7 @@ namespace Grid { virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2); virtual void 
HermOp(const Field &in, Field &out)=0; }; @@ -305,36 +305,59 @@ namespace Grid { class SchurStaggeredOperator : public SchurOperatorBase { protected: Matrix &_Mat; + Field tmp; + RealD mass; + double tMpc; + double tIP; + double tMeo; + double taxpby_norm; + uint64_t ncall; public: - SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; + void Report(void) + { + std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "< Date: Thu, 26 Apr 2018 10:33:19 +0100 Subject: [PATCH 14/25] Merge staggered fix linear operator and reduction --- lib/algorithms/LinearOperator.h | 2 +- lib/lattice/Lattice_reduction.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index c9410410..96b1ed1a 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -51,7 +51,7 @@ namespace Grid { virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2); + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0; virtual void HermOp(const Field &in, Field &out)=0; }; diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 7e169baf..3be4b6cb 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -46,28 +46,29 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ GridBase *grid = left._grid; const int pad = 8; - scalar_type nrm; - std::vector > sumarray(grid->SumArraySize()*pad); + ComplexD inner; + Vector sumarray(grid->SumArraySize()*pad); parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation for(int ss=myoff;ssSumArraySize();i++){ - nrm = nrm+sumarray[i*pad]; + inner = inner+sumarray[i*pad]; } - right._grid->GlobalSum(nrm); - return nrm; + right._grid->GlobalSum(inner); + return inner; } ///////////////////////// From 441ad7498dcec9d88ec8d38c405fa8c8de5260af Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 2 May 2018 14:21:30 +0100 Subject: [PATCH 15/25] add Iterative counter --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index a9ccfd2c..1ad79856 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -43,6 +43,7 @@ namespace Grid { public: RealD Tolerance; Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion int verbose; MultiShiftFunction shifts; @@ -269,6 +270,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector RealD cn = norm2(src); std::cout< Date: Wed, 2 May 2018 14:22:37 +0100 Subject: [PATCH 16/25] MultiShift for Staggered --- tests/solver/Test_staggered_multishift.cc | 121 ++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 tests/solver/Test_staggered_multishift.cc diff --git a/tests/solver/Test_staggered_multishift.cc b/tests/solver/Test_staggered_multishift.cc new file mode 100644 index 00000000..04386027 --- /dev/null +++ b/tests/solver/Test_staggered_multishift.cc @@ -0,0 +1,121 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_wilson_cg_unprec.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOpEO(Ds); + + FermionField src(&Grid); random(pRNG,src); + FermionField src_o(&RBGrid); + pickCheckerboard(Odd,src_o,src); + + + ///////////////////////////////// + //Multishift CG + ///////////////////////////////// + std::vector result(degree,&RBGrid); + ConjugateGradientMultiShift MSCG(10000,Sqrt); + + double deodoe_flops=(1205+15*degree)*volume; // == 66*16 + == 1146 + + double t1=usecond(); + MSCG(HermOpEO,src_o,result); + double t2=usecond(); + double ncall=MSCG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< &Linop, const Field &src, std::vector } } } + AXPYTimer.Stop(); cp=c; - + MatrixTimer.Start(); Linop.HermOpAndNorm(p,mmp,d,qq); + MatrixTimer.Stop(); + + AXPYTimer.Start(); axpy(mmp,mass[0],p,mmp); + AXPYTimer.Stop(); RealD rn = 
norm2(p); d += rn*mass[0]; bp=b; b=-cp/d; + AXPYTimer.Start(); c=axpy_norm(r,b,mmp,r); + AXPYTimer.Stop(); // Toggle the recurrence history bs[0] = b; iz = 1-iz; + ShiftTimer.Start(); for(int s=1;s &Linop, const Field &src, std::vector bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike } } + ShiftTimer.Stop(); for(int s=0;s &Linop, const Field &src, std::vector // Before: 3 x npole + 3 x npole // After : 2 x npole (ps[s]) => 3x speed up of multishift CG. + AXPYTimer.Start(); if( (!converged[s]) ) { axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); } + AXPYTimer.Stop(); } // Convergence checks @@ -258,6 +281,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector if ( all_converged ){ + SolverTimer.Stop(); + + std::cout< &Linop, const Field &src, std::vector std::cout< Date: Fri, 4 May 2018 10:58:01 +0100 Subject: [PATCH 18/25] Add timing --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index 28a80ce6..9d2719fc 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -207,7 +207,10 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector cp=c; MatrixTimer.Start(); - Linop.HermOpAndNorm(p,mmp,d,qq); + //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used + Linop.HermOp(p,mmp); + d=real(innerProduct(p,mmp)); + MatrixTimer.Stop(); AXPYTimer.Start(); @@ -253,11 +256,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector // Before: 3 x npole + 3 x npole // After : 2 x npole (ps[s]) => 3x speed up of multishift CG. - AXPYTimer.Start(); if( (!converged[s]) ) { axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); } - AXPYTimer.Stop(); } // Convergence checks From 3c7a4106ed20d92f4352cdb6e3394c2a8762ba44 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 7 May 2018 17:26:39 +0100 Subject: [PATCH 19/25] Trap for deadly empty comm thread option --- lib/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 45a37a02..7dc8230e 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -385,6 +385,7 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); + assert(CartesianCommunicator::nCommThreads > 0); } if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); From c24d53bbd19d1e55370de9d6f15b7542cbe46ac8 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 7 May 2018 18:55:05 +0100 Subject: [PATCH 20/25] Further debug of RNG I/O --- lib/parallelIO/BinaryIO.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 08b7b9b4..a60fe962 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -431,14 +431,20 @@ PARALLEL_CRITICAL MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); } - std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl; + std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl; ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr == 0); - std::cout << GridLogDebug << "MPI read I/O write all " << file << 
std::endl; + std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl; ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr == 0); + MPI_Offset os; + MPI_File_get_position(fh, &os); + MPI_File_get_byte_offset(fh, os, &disp); + offset = disp; + + MPI_File_close(&fh); MPI_Type_free(&fileArray); MPI_Type_free(&localArray); @@ -448,7 +454,7 @@ PARALLEL_CRITICAL } else { std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " - << iodata.size() * sizeof(fobj) << " bytes" << std::endl; + << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl; std::ofstream fout; fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); From f871fb0c6dbd46dcc7ebb296bfd6297a5178b440 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 11 May 2018 18:06:28 +0100 Subject: [PATCH 21/25] check file is opened correctly in the Lime reader --- lib/parallelIO/IldgIO.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 90c05546..33af6c3d 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO { { filename= _filename; File = fopen(filename.c_str(), "r"); + if (File == nullptr) + { + std::cerr << "cannot open file '" << filename << "'" << std::endl; + abort(); + } LimeR = limeCreateReader(File); } ///////////////////////////////////////////// From a61e0df54b8836337a7b388620e62d57c502c953 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 14 May 2018 19:56:12 +0100 Subject: [PATCH 22/25] Travis fix for Lime --- .travis.yml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index ad4e5b73..55f7c097 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,8 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi install: + - export CWD=`pwd` + - echo $CWD - export CC=$CC$VERSION - export CXX=$CXX$VERSION - echo $PATH @@ -36,11 +38,22 @@ script: - ./bootstrap.sh - mkdir build - cd build - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none + - mkdir lime + - cd lime + - mkdir build + - cd build + - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz + - tar xf lime-1.3.2.tar.gz + - cd lime-1.3.2 + - ./configure --prefix=$CWD/build/lime/install + - make -j4 + - make install + - cd $CWD/build + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none + - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check From 68c028b0a6039fe9504ab2c77ab4635e5241dc34 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 21 May 2018 12:54:25 +0100 Subject: [PATCH 23/25] Comment --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index 9d2719fc..4c311bc1 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -208,6 +208,7 @@ void operator() (LinearOperatorBase &Linop, 
const Field &src, std::vector cp=c; MatrixTimer.Start(); //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used + // The below is faster on KNL Linop.HermOp(p,mmp); d=real(innerProduct(p,mmp)); From 0e127b1fc788c0d343868d99714f8d286b451801 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 21 May 2018 12:57:13 +0100 Subject: [PATCH 24/25] New file single prec test --- tests/core/Test_staggered5DvecF.cc | 196 +++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 tests/core/Test_staggered5DvecF.cc diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc new file mode 100644 index 00000000..5d421673 --- /dev/null +++ b/tests/core/Test_staggered5DvecF.cc @@ -0,0 +1,196 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_wilson.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + const int Ls=16; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds({1,2,3,4}); + + GridParallelRNG pRNG4(UGrid); + GridParallelRNG pRNG5(FGrid); + pRNG4.SeedFixedIntegers(seeds); + pRNG5.SeedFixedIntegers(seeds); + + typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DF::ImplParams params; + + FermionField src (FGrid); + random(pRNG5,src); + /* + std::vector site({0,1,2,0,0}); + ColourVector cv = zero; + cv()()(0)=1.0; + src = zero; + pokeSite(cv,src,site); + */ + FermionField result(FGrid); result=zero; + FermionField tmp(FGrid); tmp=zero; + FermionField err(FGrid); tmp=zero; + FermionField phi (FGrid); random(pRNG5,phi); + FermionField chi (FGrid); random(pRNG5,chi); + + LatticeGaugeFieldF Umu(UGrid); + SU3::HotConfiguration(pRNG4,Umu); + + /* + for(int mu=1;mu<4;mu++){ + auto tmp = PeekIndex(Umu,mu); + tmp = zero; 
+ PokeIndex(Umu,tmp,mu); + } + */ + double volume=Ls; + for(int mu=0;mu Date: Mon, 4 Jun 2018 18:34:15 +0100 Subject: [PATCH 25/25] Solve g++ problem on the lanczos test --- lib/algorithms/iterative/ImplicitlyRestartedLanczos.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 8011e796..cee566e6 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -57,8 +57,10 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i parallel_region { - std::vector < vobj > B(Nm); // Thread private - + Vector < vobj > B; // Thread private + + PARALLEL_CRITICAL { B.resize(Nm); } + parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ for(int j=j0; j
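The final basisRotate change replaces a per-thread std::vector with Grid's aligned Vector, resized inside a critical section, presumably to serialise the allocations and sidestep the g++ issue the commit mentions. A minimal standalone sketch of the pattern follows, assuming PARALLEL_CRITICAL maps to an OpenMP critical section and using std::vector as a stand-in for Grid's aligned Vector; the sizes are illustrative.

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  const int Nm     = 64;     // number of basis vectors (illustrative)
  const int nsites = 1024;   // outer site loop extent (illustrative)
  double checksum  = 0.0;

#pragma omp parallel reduction(+:checksum)
  {
    std::vector<double> B;   // thread-private scratch, cf. Vector<vobj> B
#pragma omp critical
    { B.resize(Nm, 0.0); }   // serialised allocation, cf. PARALLEL_CRITICAL { B.resize(Nm); }

#pragma omp for
    for (int ss = 0; ss < nsites; ss++) {
      for (int j = 0; j < Nm; j++) B[j] = ss + j;   // stand-in for the rotation accumulation
      checksum += B[Nm - 1];
    }
  }
  printf("checksum %f\n", checksum);   // deterministic regardless of thread count
  return 0;
}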