From 24162c9eadb6316f2125c2430c51440969cd2310 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 9 Jan 2018 13:02:52 +0000 Subject: [PATCH 01/26] Staggered overlap comms comput --- .../fermion/ImprovedStaggeredFermion5D.cc | 164 ++++++ .../fermion/ImprovedStaggeredFermion5D.h | 19 + lib/qcd/action/fermion/StaggeredKernels.cc | 389 +++++++------- lib/qcd/action/fermion/StaggeredKernels.h | 79 ++- lib/qcd/action/fermion/StaggeredKernelsAsm.cc | 106 +++- .../action/fermion/StaggeredKernelsHand.cc | 497 ++++++++++-------- lib/qcd/action/fermion/WilsonCompressor.h | 59 +-- lib/qcd/action/fermion/WilsonFermion5D.cc | 3 +- lib/stencil/Stencil.h | 40 +- lib/util/Init.cc | 2 + 10 files changed, 869 insertions(+), 489 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 7d988d89..2df4c044 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -124,6 +124,15 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, // Allocate the required comms buffer ImportGauge(_Uthin,_Ufat); + + int LLs = FiveDimGrid._rdimensions[0]; + int vol4= FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + + vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + } template @@ -223,6 +232,157 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, assert(0); } +/*CHANGE */ +template +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + DhopTotalTime-=usecond(); +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopTotalTime+=usecond(); +} + +template +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + + Compressor compressor; + + int LLs = in._grid->_rdimensions[0]; + int len = U._grid->oSites(); + + DhopFaceTime-=usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms + DhopFaceTime+=usecond(); + + double ctime=0; + double ptime=0; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); // 4d vol + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<--------- + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------ + } + } + ptime = usecond() - start; + } else { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; + } + } + DhopCommTime += ctime; + DhopComputeTime+=ptime; + + // First to enter, last to leave timing + st.CollateThreads(); + + DhopFaceTime-=usecond(); + st.CommsMerge(compressor); + DhopFaceTime+=usecond(); + + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<---------- + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<---------- + } + } + DhopComputeTime2+=usecond(); +#else + assert(0); +#endif + +} + +template +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + Compressor compressor; + int LLs = in._grid->_rdimensions[0]; + + + + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); + st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); + } + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +} +/*CHANGE END*/ + +/* ORG template void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,DoubledGaugeField & UUU, @@ -254,6 +414,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime += usecond(); DhopTotalTime += usecond(); } +*/ template @@ -336,6 +497,9 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) DhopTotalTime = 0; DhopCommTime = 0; DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index ca1a955a..e21142b8 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -64,6 +64,8 @@ namespace QCD { double DhopCalls; double DhopCommTime; double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base @@ -119,6 +121,23 @@ namespace QCD { FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + void DhopInternalSerialComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index b6ec14c7..ea44cdd8 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -32,223 +32,242 @@ namespace Grid { namespace QCD { int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; +int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; + +#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); + +#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } + +#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + chi_p = &buf[SE->_offset]; \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } template StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; -//////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////// // Generic implementation; move to different file? -//////////////////////////////////////////// - +// Int, Ext, Int+Ext cases for comms overlap +//////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) { +void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, int dag) { const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; StencilEntry *SE; int ptype; - int skew = 0; - if (threeLink) skew=8; - /////////////////////////// - // Xp - /////////////////////////// + int skew; - SE = st.GetEntry(ptype, Xp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; + for(int s=0;s_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm); - - vstream(out, Uchi); }; + /////////////////////////////////////////////////// + // Only contributions from interior of our node + /////////////////////////////////////////////////// +template +void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int skew ; + + for(int s=0;s +void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int nmu=0; + int skew ; + + for(int s=0;s void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionField &in, FermionField &out) { - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, + int interior,int exterior) +{ int dag=1; - switch(Opt) { -#ifdef AVX512 - //FIXME; move the sign into the Asm routine - case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { assert(0); }; @@ -645,10 +681,9 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -685,7 +720,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -695,10 +734,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -734,7 +772,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -776,10 +818,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -832,7 +873,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -841,10 +886,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -897,7 +941,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -909,7 +957,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, DoubledGaugeField &U, \ DoubledGaugeField &UUU, \ SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out); + int sU, const FermionField &in, FermionField &out,int dag); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc index 7de8480c..47ebdd86 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc @@ -28,7 +28,6 @@ Author: paboyle /* END LEGAL */ #include -#define REGISTER #define LOAD_CHI(b) \ const SiteSpinor & ref (b[offset]); \ @@ -59,7 +58,7 @@ Author: paboyle UChi ## _1 += U_12*Chi_2;\ UChi ## _2 += U_22*Chi_2; -#define MULT_ADD(A,UChi) \ +#define MULT_ADD(U,A,UChi) \ auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ @@ -82,241 +81,319 @@ Author: paboyle #define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0);\ - permute##dir(Chi_1,Chi_1);\ - permute##dir(Chi_2,Chi_2); + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + + +#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else { \ + LOAD_CHI(buf); \ + } + +#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT(Dir,even); \ + } + +#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT_ADD(U,Dir,even); \ + } + + + +#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else if ( st.same_node[Dir] ) { \ + LOAD_CHI(buf); \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + MULT_ADD(U,Dir,even); \ + } + +#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + { LOAD_CHI(buf); } \ + { MULT_ADD(U,Dir,even); } \ + } namespace Grid { namespace QCD { template -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out, int dag) -{ - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; - int skew(0); - Real scale(1.0); - - if(dag) scale = -1.0; - - for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) +void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - REGISTER Simd even_0; // 12 regs on knc - REGISTER Simd even_1; - REGISTER Simd even_2; - REGISTER Simd odd_0; // 12 regs on knc - REGISTER Simd odd_1; - REGISTER Simd odd_2; + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; - REGISTER Simd Chi_0; // two spinor; 6 regs - REGISTER Simd Chi_1; - REGISTER Simd Chi_2; - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. - REGISTER Simd U_02; - REGISTER Simd U_12; - REGISTER Simd U_22; - - int skew = 0; - if (threeLink) skew=8; + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + SiteSpinor result; int offset,local,perm, ptype; + StencilEntry *SE; + int skew; - // Xp - SE=st.GetEntry(ptype,Xp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Xp,even); - } - - // Yp - SE=st.GetEntry(ptype,Yp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Yp,odd); - } + for(int s=0;s_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + skew = 0; + HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); + HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd); + HAND_STENCIL_LEG (U,Zp,1,skew,even); + HAND_STENCIL_LEG (U,Tp,0,skew,odd); + HAND_STENCIL_LEG (U,Xm,3,skew,even); + HAND_STENCIL_LEG (U,Ym,2,skew,odd); + HAND_STENCIL_LEG (U,Zm,1,skew,even); + HAND_STENCIL_LEG (U,Tm,0,skew,odd); + skew = 8; + HAND_STENCIL_LEG(UUU,Xp,3,skew,even); + HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zp,1,skew,even); + HAND_STENCIL_LEG(UUU,Tp,0,skew,odd); + HAND_STENCIL_LEG(UUU,Xm,3,skew,even); + HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); + HAND_STENCIL_LEG(UUU,Zm,1,skew,even); + HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); + + if ( dag ) { + result()()(0) = - even_0 - odd_0; + result()()(1) = - even_1 - odd_1; + result()()(2) = - even_2 - odd_2; + } else { + result()()(0) = even_0 + odd_0; + result()()(1) = even_1 + odd_1; + result()()(2) = even_2 + odd_2; } - } else { - LOAD_CHI(buf); + vstream(out._odata[sF],result); } - { - MULT_ADD(Zp,even); - } - - // Tp - SE=st.GetEntry(ptype,Tp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tp,odd); - } - - // Xm - SE=st.GetEntry(ptype,Xm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Xm,even); - } - - - // Ym - SE=st.GetEntry(ptype,Ym+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Ym,odd); - } - - // Zm - SE=st.GetEntry(ptype,Zm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Zm,even); - } - - // Tm - SE=st.GetEntry(ptype,Tm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tm,odd); - } - - vstream(out()()(0),even_0+odd_0); - vstream(out()()(1),even_1+odd_1); - vstream(out()()(2),even_2+odd_2); - } + +template +void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s +void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + + for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeField &U,DoubledGaugeField &UUU, \ - SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out, int dag); + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ -#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \ - SiteSpinor *buf, int sF, \ - int sU, const FermionField &in, SiteSpinor &out,int threeLink) ; DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF); -}} +} +} + diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index cc5c3c63..3660cc0b 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -267,41 +267,16 @@ public: } typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector same_node; - std::vector surface_list; - WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) , - same_node(npoints) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) { ZeroCountersi(); - surface_list.resize(0); }; - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -362,23 +337,23 @@ public: int dag = compress.dag; int face_idx=0; if ( dag ) { - assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 1da58ddb..78124beb 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -444,8 +444,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } } ptime = usecond() - start; - } - { + } else { double start = usecond(); st.CommunicateThreaded(); ctime = usecond() - start; diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 887d8a7c..d1c4ae95 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -149,7 +149,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::vector _distances; std::vector _comm_buf_size; std::vector _permute_type; - + std::vector same_node; + std::vector surface_list; + Vector _entries; std::vector Packets; std::vector Mergers; @@ -200,7 +202,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int dimension = _directions[point]; int displacement = _distances[point]; - assert( (displacement==1) || (displacement==-1)); + int pd = _grid->_processors[dimension]; int fd = _grid->_fdimensions[dimension]; @@ -215,9 +217,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if ( ! comm_dim ) return 1; int nbr_proc; - if (displacement==1) nbr_proc = 1; - else nbr_proc = pd-1; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); @@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; + // Move interior/exterior split into the generic stencil + // FIXME Explicit Ls in interface is a pain. Should just use a vol + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + } + + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + CartesianStencil(GridBase *grid, int npoints, int checkerboard, @@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal comm_bytes_thr(npoints), comm_enter_thr(npoints), comm_leave_thr(npoints), - comm_time_thr(npoints) + comm_time_thr(npoints), + same_node(npoints) { face_table_computed=0; _npoints = npoints; @@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _directions = directions; _distances = distances; _unified_buffer_size=0; + surface_list.resize(0); int osites = _grid->oSites(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 031f8f5a..13b8f219 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -360,8 +360,10 @@ void Grid_init(int *argc,char ***argv) } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute; } else { QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent); From 63982819c685b852e56fca552896afb78ebe4598 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 22 Jan 2018 11:03:39 +0000 Subject: [PATCH 02/26] No option to overlap comms and compute for asm implementation since different directions are interleaved in the kernels, introducing if else structure would be too painful --- lib/qcd/action/fermion/StaggeredKernels.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index c0c23caa..79de1a68 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -93,14 +93,6 @@ public: DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, int LLs, int sU, const FermionField &in, FermionField &out,int dag); - void DhopSiteAsmInt(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionField &in, FermionField &out,int dag); - void DhopSiteAsmExt(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionField &in, FermionField &out,int dag); /////////////////////////////////////////////////////////////////////////////////////////////////// // Generic interface; fan out to right routine /////////////////////////////////////////////////////////////////////////////////////////////////// From 97b9c6f03d1a813674cc2e1c52de50ebfac32fdf Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 22 Jan 2018 11:04:19 +0000 Subject: [PATCH 03/26] No option for interior/exterior split of asm kernels since different directions get interleaved --- lib/qcd/action/fermion/StaggeredKernels.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index ea44cdd8..b7e568c2 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -245,10 +245,9 @@ void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double case OptInlineAsm: if ( interior && exterior ) { DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else if ( interior ) { - DhopSiteAsmInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else if ( exterior ) { - DhopSiteAsmExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); + } else { + std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"< Date: Thu, 22 Feb 2018 12:50:09 +0000 Subject: [PATCH 04/26] OverlappedComm for Staggered 5D and 4D. --- lib/algorithms/iterative/ConjugateGradient.h | 8 +- lib/qcd/action/fermion/FermionOperatorImpl.h | 73 +++++----- .../fermion/ImprovedStaggeredFermion.cc | 126 ++++++++++++++++-- .../action/fermion/ImprovedStaggeredFermion.h | 16 ++- .../fermion/ImprovedStaggeredFermion5D.cc | 67 +++++++++- .../fermion/ImprovedStaggeredFermion5D.h | 23 +++- tests/solver/Test_staggered_block_cg_prec.cc | 5 +- .../solver/Test_staggered_block_cg_unprec.cc | 7 +- tests/solver/Test_staggered_cg_prec.cc | 5 +- tests/solver/Test_staggered_cg_schur.cc | 5 +- tests/solver/Test_staggered_cg_unprec.cc | 5 +- 11 files changed, 269 insertions(+), 71 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index 0d4e51c7..e1c796cd 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction { void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { + psi.checkerboard = src.checkerboard; conformable(psi, src); @@ -101,7 +102,7 @@ class ConjugateGradient : public OperatorFunction { SolverTimer.Start(); int k; - for (k = 1; k <= MaxIterations; k++) { + for (k = 1; k <= MaxIterations*1000; k++) { c = cp; MatrixTimer.Start(); @@ -109,8 +110,9 @@ class ConjugateGradient : public OperatorFunction { MatrixTimer.Stop(); LinalgTimer.Start(); - // RealD qqck = norm2(mmp); - // ComplexD dck = innerProduct(p,mmp); + // AA + // RealD qqck = norm2(mmp); + // ComplexD dck = innerProduct(p,mmp); a = c / d; b_pred = a * (a * qq - d) / c; diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 9d24deb2..42f222ac 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -689,7 +689,12 @@ class StaggeredImpl : public PeriodicGaugeImpl(U_ds, U, mu); + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -728,8 +733,10 @@ class StaggeredImpl : public PeriodicGaugeImpl(Uds, U, mu); - PokeIndex(Uds, Udag, mu + 4); + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); + // PokeIndex(Uds, U, mu); + // PokeIndex(Uds, Udag, mu + 4); // 3 hop based on thin links. Crazy huh ? U = PeekIndex(Uthin, mu); @@ -741,8 +748,8 @@ class StaggeredImpl : public PeriodicGaugeImpl(UUUds, UUU, mu); - PokeIndex(UUUds, UUUdag, mu+4); + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } @@ -834,6 +841,23 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { + + SiteScalarGaugeLink ScalarU; + SiteDoubledGaugeField ScalarUds; + + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); + peekLocalSite(ScalarUds, U_ds, lcoor); + + peekLocalSite(ScalarU, U, lcoor); + ScalarUds(mu) = ScalarU(); + + } + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -875,23 +899,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, Uds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, Udag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); // 3 hop based on thin links. Crazy huh ? U = PeekIndex(Uthin, mu); @@ -903,24 +912,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUds, UUUds, lcoor); - - peekLocalSite(ScalarU, UUU, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, UUUdag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, UUUds, lcoor); - } + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 5ce2b335..811e482d 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, template ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) : Kernels(p), _grid(&Fgrid), @@ -62,6 +63,16 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G UUUmuOdd(&Hgrid) , _tmp(&Hgrid) { + int vol4; + int LLs=1; + c1=_c1; + c2=_c2; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); } template @@ -69,18 +80,15 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau GridRedBlackCartesian &Hgrid, RealD _mass, RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) + : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p) { - c1=_c1; - c2=_c2; - u0=_u0; ImportGauge(_Uthin,_Ufat); } template -ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, +ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) + : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,1.0,1.0,1.0,p) { ImportGaugeSimple(_Utriple,_Ufat); } @@ -374,20 +382,116 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); +} +template +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } + } else { + st.CommunicateThreaded(); + } + } + + // First to enter, last to leave timing + st.CommsMerge(compressor); + + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } +#else + assert(0); +#endif +} + + +template +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor; st.HaloExchange(in, compressor); if (dag == DaggerYes) { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } else { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 7d1f2996..69d0aef4 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -105,21 +105,31 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); // Constructor ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); - ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, + ////////////////////////////////////////////////////////////////////////// + // MILC constructor no coefficients; premultiply links by desired scaling + ////////////////////////////////////////////////////////////////////////// + ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p = ImplParams()); + ////////////////////////////////////////////////////////////////////////// + // A don't initialise the gauge field constructor; largely internal + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); - // DoubleStore impl dependent void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 2df4c044..e5146d7a 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, // 5d lattice for DWF. template -ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, - GridCartesian &FiveDimGrid, +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid, GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, @@ -121,18 +120,43 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, assert(FiveDimGrid._simd_layout[0] ==1); } - - // Allocate the required comms buffer - ImportGauge(_Uthin,_Ufat); - int LLs = FiveDimGrid._rdimensions[0]; int vol4= FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); vol4=FourDimRedBlackGrid.oSites(); StencilEven.BuildSurfaceList(LLs,vol4); - StencilOdd.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} +template +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass, + RealD _c1,RealD _c2, RealD _u0, + const ImplParams &p) : + ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,_c1,_c2,_u0,p) +{ + ImportGauge(_Uthin,_Ufat); +} +template +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Utriple,GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass, + const ImplParams &p) : + ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,1.0,1.0,1.0,p) +{ + ImportGaugeSimple(_Utriple,_Ufat); } template @@ -140,6 +164,35 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) { ImportGauge(_Uthin,_Uthin); }; +/////////////////////////////////////////////////// +// For MILC use; pass three link U's and 1 link U +/////////////////////////////////////////////////// +template +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) +{ + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + auto U = PeekIndex(_Utriple, mu); + Impl::InsertGaugeField(UUUmu,U,mu); + + U = adj( Cshift(U, mu, -3)); + Impl::InsertGaugeField(UUUmu,-U,mu+4); + + U = PeekIndex(_Ufat, mu); + Impl::InsertGaugeField(Umu,U,mu); + + U = adj( Cshift(U, mu, -1)); + Impl::InsertGaugeField(Umu,-U,mu+4); + + } + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); + pickCheckerboard(Even, UUUmuEven,UUUmu); + pickCheckerboard(Odd, UUUmuOdd, UUUmu); +} template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) { diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index e21142b8..f2fce1c1 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -139,6 +139,15 @@ namespace QCD { // Constructors + // -- No Gauge field + ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _mass, + RealD _c1, RealD _c2,RealD _u0, + const ImplParams &p= ImplParams()); + // -- Thin link and fat link, with coefficients ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -146,12 +155,24 @@ namespace QCD { GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, + RealD _c1, RealD _c2,RealD _u0, + const ImplParams &p= ImplParams()); + //////////////////////////////////////////////////////////////////////////////////////////////// + // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied + //////////////////////////////////////////////////////////////////////////////////////////////// + ImprovedStaggeredFermion5D(GaugeField &_Utriple, + GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _mass, const ImplParams &p= ImplParams()); // DoubleStore void ImportGauge(const GaugeField &_U); void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_Uthin,const GaugeField &_Ufat); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 0076e5a0..98a5074e 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -75,7 +75,10 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); SchurStaggeredOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 22051ef6..32a1448c 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -74,7 +74,10 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); RealD mass=0.003; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -86,7 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=zero; diff --git a/tests/solver/Test_staggered_cg_prec.cc b/tests/solver/Test_staggered_cg_prec.cc index 6bea97c2..167958ed 100644 --- a/tests/solver/Test_staggered_cg_prec.cc +++ b/tests/solver/Test_staggered_cg_prec.cc @@ -71,7 +71,10 @@ int main (int argc, char ** argv) } RealD mass=0.003; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); FermionField res_o(&RBGrid); FermionField src_o(&RBGrid); diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc index a5c25b85..03a173f9 100644 --- a/tests/solver/Test_staggered_cg_schur.cc +++ b/tests/solver/Test_staggered_cg_schur.cc @@ -65,7 +65,10 @@ int main (int argc, char ** argv) FermionField resid(&Grid); RealD mass=0.1; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackStaggeredSolve SchurSolver(CG); diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc index eb33c004..3fae925a 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -73,7 +73,10 @@ int main (int argc, char ** argv) } RealD mass=0.1; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-6,10000); From b9382020813d3872d4397da89dec16f68a950be6 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 7 Mar 2018 14:08:43 +0000 Subject: [PATCH 05/26] Overlapped Comm for Wilson DhopInternal --- lib/qcd/action/fermion/WilsonFermion.cc | 90 ++++++++++++++++++++++++- lib/qcd/action/fermion/WilsonFermion.h | 6 ++ lib/qcd/action/fermion/WilsonKernels.h | 41 +++++------ 3 files changed, 114 insertions(+), 23 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 19f9674d..a7c69ac9 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -322,15 +322,98 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } -}; - +} +/*Change starts*/ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { - assert((dag == DaggerNo) || (dag == DaggerYes)); +#ifdef GRID_OMP + if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else +#endif + DhopInternalSerial(st,lo,U,in,out,dag); +} + +template +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { + assert((dag == DaggerNo) || (dag == DaggerYes)); +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + // do the compute + if (dag == DaggerYes) { + + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } //else + + } else { + st.CommunicateThreaded(); + } + + Compressor compressor(dag); + + if (dag == DaggerYes) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } + + } //pragma +#else + assert(0); +#endif +}; + + +template +void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { + assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); st.HaloExchange(in, compressor); @@ -344,6 +427,7 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, } } }; +/*Change ends */ FermOpTemplateInstantiate(WilsonFermion); AdjointFermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 933be732..55433854 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -115,6 +115,12 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 2cf52660..e84b52f9 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -53,7 +53,7 @@ template class WilsonKernels : public FermionOperator , public typedef FermionOperator Base; public: - + template typename std::enable_if::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, @@ -70,27 +70,27 @@ public: break; #endif case OptHandUnroll: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } break; case OptGeneric: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - else assert(0); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); + sF++; + } + sU++; + } break; default: assert(0); @@ -200,6 +200,7 @@ private: void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out); + void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); From 80302e95a84ffcbe9e0975241af575febc07c804 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 8 Mar 2018 15:34:03 +0000 Subject: [PATCH 06/26] MILC Interface --- .../fermion/ImprovedStaggeredFermion.cc | 33 ++++++------- .../action/fermion/ImprovedStaggeredFermion.h | 25 +++++----- .../fermion/ImprovedStaggeredFermion5D.cc | 48 ++++++++----------- .../fermion/ImprovedStaggeredFermion5D.h | 33 +++++++------ tests/core/Test_staggered5Dvec.cc | 5 +- 5 files changed, 67 insertions(+), 77 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 811e482d..69a3fdc1 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -84,15 +84,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau { ImportGauge(_Uthin,_Ufat); } -template -ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,1.0,1.0,1.0,p) -{ - ImportGaugeSimple(_Utriple,_Ufat); -} - //////////////////////////////////////////////////////////// // Momentum space propagator should be @@ -106,11 +97,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Utriple, G // of above link to implmement fourier based solver. //////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin) -{ - ImportGauge(_Uthin,_Uthin); -}; -template void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) { ///////////////////////////////////////////////////////////////// @@ -133,6 +119,20 @@ void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utripl PokeIndex(Umu, -U, mu+4); } + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); +} + +template +void ImprovedStaggeredFermion::CopyGaugeCheckerboards(void) +{ pickCheckerboard(Even, UmuEven, Umu); pickCheckerboard(Odd, UmuOdd , Umu); pickCheckerboard(Even, UUUmuEven,UUUmu); @@ -168,10 +168,7 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } ///////////////////////////// diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 69d0aef4..c2c82073 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -110,30 +110,29 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - // Constructor + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); ////////////////////////////////////////////////////////////////////////// - // MILC constructor no coefficients; premultiply links by desired scaling - ////////////////////////////////////////////////////////////////////////// - ImprovedStaggeredFermion(GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p = ImplParams()); - - ////////////////////////////////////////////////////////////////////////// - // A don't initialise the gauge field constructor; largely internal + // MILC constructor no gauge fields ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - RealD _c1, RealD _c2,RealD _u0, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, const ImplParams &p = ImplParams()); // DoubleStore impl dependent - void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin); + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index e5146d7a..df9f055e 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -128,7 +128,14 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); } - +template +void ImprovedStaggeredFermion5D::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); + pickCheckerboard(Even, UUUmuEven,UUUmu); + pickCheckerboard(Odd, UUUmuOdd, UUUmu); +} template ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -144,26 +151,7 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, { ImportGauge(_Uthin,_Ufat); } -template -ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Utriple,GaugeField &_Ufat, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - RealD _mass, - const ImplParams &p) : - ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, - FourDimGrid,FourDimRedBlackGrid, - _mass,1.0,1.0,1.0,p) -{ - ImportGaugeSimple(_Utriple,_Ufat); -} -template -void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) -{ - ImportGauge(_Uthin,_Uthin); -}; /////////////////////////////////////////////////// // For MILC use; pass three link U's and 1 link U /////////////////////////////////////////////////// @@ -188,10 +176,17 @@ void ImprovedStaggeredFermion5D::ImportGaugeSimple(const GaugeField &_Utri Impl::InsertGaugeField(Umu,-U,mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven,UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); } template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) @@ -221,10 +216,7 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,cons PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } template void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f2fce1c1..d9174ec8 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -139,15 +139,9 @@ namespace QCD { // Constructors - // -- No Gauge field - ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _mass, - RealD _c1, RealD _c2,RealD _u0, - const ImplParams &p= ImplParams()); - // -- Thin link and fat link, with coefficients + //////////////////////////////////////////////////////////////////////////////////////////////// + // Grid internal interface -- Thin link and fat link, with coefficients + //////////////////////////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -160,19 +154,24 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////////////////// // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied //////////////////////////////////////////////////////////////////////////////////////////////// - ImprovedStaggeredFermion5D(GaugeField &_Utriple, - GaugeField &_Ufat, - GridCartesian &FiveDimGrid, + ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid, GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _mass, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, const ImplParams &p= ImplParams()); - - // DoubleStore - void ImportGauge(const GaugeField &_U); - void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); - void ImportGaugeSimple(const GaugeField &_Uthin,const GaugeField &_Ufat); + + // DoubleStore gauge field in operator + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); + // Give a reference; can be used to do an assignment or copy back out after import + // if Carleton wants to cache them and not use the ImportSimple + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index db76f792..2720c24c 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -141,6 +141,7 @@ int main (int argc, char ** argv) t1=usecond(); std::cout<"). + (space-separated strings) in round brackets (i.e. (g_sink g_src)), + in a sequence (e.g. "(Gamma5 Gamma5)(Gamma5 GammaT)"). Special values: "all" - perform all possible contractions. - sink: module to compute the sink to use in contraction (string). diff --git a/tests/hadrons/Test_QED.cc b/tests/hadrons/Test_QED.cc index 3377bf3c..c0bfacc6 100644 --- a/tests/hadrons/Test_QED.cc +++ b/tests/hadrons/Test_QED.cc @@ -189,7 +189,7 @@ int main(int argc, char *argv[]) mesPar.output = "QED/pt_" + flavour[i] + flavour[j]; mesPar.q1 = "Qpt_" + flavour[i]; mesPar.q2 = "Qpt_" + flavour[j]; - mesPar.gammas = ""; + mesPar.gammas = "(Gamma5 Gamma5)"; mesPar.sink = "sink"; application.createModule("meson_pt_" + flavour[i] + flavour[j], @@ -203,7 +203,7 @@ int main(int argc, char *argv[]) + flavour[i] + "__" + flavour[j]; mesPar_seq_T.q1 = "Qpt_" + flavour[i] + "_seq_T" + flavour[i]; mesPar_seq_T.q2 = "Qpt_" + flavour[j]; - mesPar_seq_T.gammas = ""; + mesPar_seq_T.gammas = "(Gamma5 Gamma5)"; mesPar_seq_T.sink = "sink"; application.createModule("meson_tadpole_pt_" + flavour[i] + "_seq_T" @@ -219,7 +219,7 @@ int main(int argc, char *argv[]) + flavour[j]; mesPar_seq_E.q1 = "Qpt_" + flavour[i] + "_seq_V_ph_" + flavour[i]; mesPar_seq_E.q2 = "Qpt_" + flavour[j] + "_seq_V_ph_" + flavour[j]; - mesPar_seq_E.gammas = ""; + mesPar_seq_E.gammas = "(Gamma5 Gamma5)"; mesPar_seq_E.sink = "sink"; application.createModule("meson_exchange_pt_" + flavour[i] + "_seq_V_ph_" + flavour[i] @@ -236,7 +236,7 @@ int main(int argc, char *argv[]) mesPar_seq_S.q1 = "Qpt_" + flavour[i] + "_seq_V_ph_" + flavour[i] + "_seq_V_ph_" + flavour[i]; mesPar_seq_S.q2 = "Qpt_" + flavour[j]; - mesPar_seq_S.gammas = ""; + mesPar_seq_S.gammas = "(Gamma5 Gamma5)"; mesPar_seq_S.sink = "sink"; application.createModule("meson_selfenergy_pt_" + flavour[i] + "_seq_V_ph_" From 213f8db6a2717f123f1c791614fe6689e171bdc2 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:01:39 +0100 Subject: [PATCH 08/26] Microsecond resultion --- lib/perfmon/Timer.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/perfmon/Timer.h b/lib/perfmon/Timer.h index 392ccc1d..4d32ee52 100644 --- a/lib/perfmon/Timer.h +++ b/lib/perfmon/Timer.h @@ -49,7 +49,8 @@ inline double usecond(void) { typedef std::chrono::system_clock GridClock; typedef std::chrono::time_point GridTimePoint; -typedef std::chrono::milliseconds GridTime; +typedef std::chrono::milliseconds GridMillisecs; +typedef std::chrono::microseconds GridTime; typedef std::chrono::microseconds GridUsecs; inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time) @@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli stream << time.count()<<" ms"; return stream; } +inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time) +{ + stream << time.count()<<" usec"; + return stream; +} class GridStopWatch { private: From eac6ec4b5e1b3b8be4ff877eacbbe9b6d0af5d3d Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:03:57 +0100 Subject: [PATCH 09/26] Faster reductions, important on single node staggered --- lib/lattice/Lattice_arith.h | 12 +----- lib/lattice/Lattice_reduction.h | 72 ++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/lib/lattice/Lattice_arith.h b/lib/lattice/Lattice_arith.h index c3093167..d1cbc84a 100644 --- a/lib/lattice/Lattice_arith.h +++ b/lib/lattice/Lattice_arith.h @@ -244,19 +244,11 @@ namespace Grid { template strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - axpy(ret,a,x,y); - return norm2(ret); + return axpy_norm_fast(ret,a,x,y); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - axpby(ret,a,b,x,y); - return norm2(ret); // FIXME implement parallel norm in ss loop + return axpby_norm_fast(ret,a,b,x,y); } } diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 8a3fbece..7e169baf 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -33,7 +33,7 @@ namespace Grid { // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); + auto nrm = innerProduct(arg,arg); return std::real(nrm); } @@ -43,12 +43,12 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; - scalar_type nrm; - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - + const int pad = 8; + + scalar_type nrm; + std::vector > sumarray(grid->SumArraySize()*pad); + parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); @@ -57,17 +57,69 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; + nrm = nrm+sumarray[i*pad]; } - nrm = Reduce(vvnrm);// sum across simd right._grid->GlobalSum(nrm); return nrm; } + +///////////////////////// +// Fast axpby_norm +// z = a x + b y +// return norm z +///////////////////////// +template strong_inline RealD +axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) +{ + sobj one(1.0); + return axpby_norm_fast(z,a,one,x,y); +} + +template strong_inline RealD +axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) +{ + const int pad = 8; + z.checkerboard = x.checkerboard; + conformable(z,x); + conformable(x,y); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + RealD nrm; + + GridBase *grid = x._grid; + + Vector sumarray(grid->SumArraySize()*pad); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); + + // private to thread; sub summation + decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; + for(int ss=myoff;ssSumArraySize();i++){ + nrm = nrm+sumarray[i*pad]; + } + z._grid->GlobalSum(nrm); + return nrm; +} + template inline auto sum(const LatticeUnaryExpression & expr) From 3e125c5b6172c83687ac050d7ba5373b4aa5e9ef Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:07:19 +0100 Subject: [PATCH 10/26] Faster linalg on CG optimised against staggered Sum overhead is bigger for staggered --- lib/algorithms/iterative/ConjugateGradient.h | 31 ++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index e1c796cd..ff4ba8ac 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -70,7 +70,6 @@ class ConjugateGradient : public OperatorFunction { Linop.HermOpAndNorm(psi, mmp, d, b); - r = src - mmp; p = r; @@ -97,6 +96,9 @@ class ConjugateGradient : public OperatorFunction { << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; + GridStopWatch InnerTimer; + GridStopWatch AxpyNormTimer; + GridStopWatch LinearCombTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; @@ -106,30 +108,32 @@ class ConjugateGradient : public OperatorFunction { c = cp; MatrixTimer.Start(); - Linop.HermOpAndNorm(p, mmp, d, qq); + Linop.HermOp(p, mmp); MatrixTimer.Stop(); LinalgTimer.Start(); - // AA - // RealD qqck = norm2(mmp); - // ComplexD dck = innerProduct(p,mmp); + InnerTimer.Start(); + ComplexD dc = innerProduct(p,mmp); + InnerTimer.Stop(); + d = dc.real(); a = c / d; - b_pred = a * (a * qq - d) / c; + AxpyNormTimer.Start(); cp = axpy_norm(r, -a, mmp, r); + AxpyNormTimer.Stop(); b = cp / c; - // Fuse these loops ; should be really easy - psi = a * p + psi; - p = p * b + r; - + LinearCombTimer.Start(); + parallel_for(int ss=0;ssoSites();ss++){ + vstream(psi[ss], a * p[ss] + psi[ss]); + vstream(p [ss], b * p[ss] + r[ss]); + } + LinearCombTimer.Stop(); LinalgTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k << " residual " << cp << " target " << rsq << std::endl; - std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl; - std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl; // Stopping condition if (cp <= rsq) { @@ -150,6 +154,9 @@ class ConjugateGradient : public OperatorFunction { std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < Date: Thu, 26 Apr 2018 10:08:05 +0100 Subject: [PATCH 11/26] Improvements to staggered tests timings --- tests/solver/Test_staggered_block_cg_prec.cc | 58 +++++++++++++++++-- .../solver/Test_staggered_block_cg_unprec.cc | 47 ++++++++++++++- tests/solver/Test_staggered_cg_prec.cc | 12 ++++ 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 98a5074e..82052684 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -74,6 +74,11 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + double volume=1; + for(int mu=0;mu HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d); FermionField result4d_o(UrbGrid); + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 result4d_o=zero; - CG(HermOp4d,src4d_o,result4d_o); + { + double t1=usecond(); + CG(HermOp4d,src4d_o,result4d_o); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOp(Ds); @@ -93,7 +99,19 @@ int main (int argc, char ** argv) MdagMLinearOperator HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=zero; - CG(HermOp4d,src4d,result4d); + + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 + { + double t1=usecond(); + CG(HermOp4d,src4d,result4d); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOpEO(Ds); ConjugateGradient CG(1.0e-8,10000); + double t1=usecond(); CG(HermOpEO,src_o,res_o); + double t2=usecond(); + + // Schur solver: uses DeoDoe => volume * 1146 + double ncall=CG.IterationsToComplete; + double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 + + std::cout<GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" < +void ImprovedStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion); //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 69d0aef4..750d29c6 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// @@ -142,7 +154,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS // protected: public: // any other parameters of action ??? - + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } RealD mass; RealD u0; RealD c1; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index e5146d7a..ab9c9c48 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -291,14 +291,12 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { - DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else #endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); - DhopTotalTime+=usecond(); } template @@ -412,6 +410,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, + //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); st.HaloExchange(in,compressor); @@ -432,6 +431,12 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, } DhopComputeTime += usecond(); DhopTotalTime += usecond(); + //double t2=usecond(); + //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; + } /*CHANGE END*/ diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f2fce1c1..4024b472 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -178,6 +178,9 @@ namespace QCD { // Data members require to support the functionality /////////////////////////////////////////////////////////////// public: + + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } GridBase *_FourDimGrid; GridBase *_FourDimRedBlackGrid; diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 55433854..d0181d68 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -135,6 +135,8 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { // protected: public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; RealD mass; GridBase *_grid; From 1c64ee926edf9472c6a8bcbd961463cab8c0628c Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:17:49 +0100 Subject: [PATCH 14/26] Faster staggered operator with m^2 term trivial used --- lib/algorithms/LinearOperator.h | 65 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 26746e6e..c9410410 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -51,7 +51,7 @@ namespace Grid { virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2); virtual void HermOp(const Field &in, Field &out)=0; }; @@ -305,36 +305,59 @@ namespace Grid { class SchurStaggeredOperator : public SchurOperatorBase { protected: Matrix &_Mat; + Field tmp; + RealD mass; + double tMpc; + double tIP; + double tMeo; + double taxpby_norm; + uint64_t ncall; public: - SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; + void Report(void) + { + std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "< Date: Thu, 26 Apr 2018 10:33:19 +0100 Subject: [PATCH 15/26] Merge staggered fix linear operator and reduction --- lib/algorithms/LinearOperator.h | 2 +- lib/lattice/Lattice_reduction.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index c9410410..96b1ed1a 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -51,7 +51,7 @@ namespace Grid { virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2); + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0; virtual void HermOp(const Field &in, Field &out)=0; }; diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 7e169baf..3be4b6cb 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -46,28 +46,29 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ GridBase *grid = left._grid; const int pad = 8; - scalar_type nrm; - std::vector > sumarray(grid->SumArraySize()*pad); + ComplexD inner; + Vector sumarray(grid->SumArraySize()*pad); parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation for(int ss=myoff;ssSumArraySize();i++){ - nrm = nrm+sumarray[i*pad]; + inner = inner+sumarray[i*pad]; } - right._grid->GlobalSum(nrm); - return nrm; + right._grid->GlobalSum(inner); + return inner; } ///////////////////////// From 441ad7498dcec9d88ec8d38c405fa8c8de5260af Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 2 May 2018 14:21:30 +0100 Subject: [PATCH 16/26] add Iterative counter --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index a9ccfd2c..1ad79856 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -43,6 +43,7 @@ namespace Grid { public: RealD Tolerance; Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion int verbose; MultiShiftFunction shifts; @@ -269,6 +270,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector RealD cn = norm2(src); std::cout< Date: Wed, 2 May 2018 14:22:37 +0100 Subject: [PATCH 17/26] MultiShift for Staggered --- tests/solver/Test_staggered_multishift.cc | 121 ++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 tests/solver/Test_staggered_multishift.cc diff --git a/tests/solver/Test_staggered_multishift.cc b/tests/solver/Test_staggered_multishift.cc new file mode 100644 index 00000000..04386027 --- /dev/null +++ b/tests/solver/Test_staggered_multishift.cc @@ -0,0 +1,121 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_wilson_cg_unprec.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOpEO(Ds); + + FermionField src(&Grid); random(pRNG,src); + FermionField src_o(&RBGrid); + pickCheckerboard(Odd,src_o,src); + + + ///////////////////////////////// + //Multishift CG + ///////////////////////////////// + std::vector result(degree,&RBGrid); + ConjugateGradientMultiShift MSCG(10000,Sqrt); + + double deodoe_flops=(1205+15*degree)*volume; // == 66*16 + == 1146 + + double t1=usecond(); + MSCG(HermOpEO,src_o,result); + double t2=usecond(); + double ncall=MSCG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< &Linop, const Field &src, std::vector } } } + AXPYTimer.Stop(); cp=c; - + MatrixTimer.Start(); Linop.HermOpAndNorm(p,mmp,d,qq); + MatrixTimer.Stop(); + + AXPYTimer.Start(); axpy(mmp,mass[0],p,mmp); + AXPYTimer.Stop(); RealD rn = norm2(p); d += rn*mass[0]; bp=b; b=-cp/d; + AXPYTimer.Start(); c=axpy_norm(r,b,mmp,r); + AXPYTimer.Stop(); // Toggle the recurrence history bs[0] = b; iz = 1-iz; + ShiftTimer.Start(); for(int s=1;s &Linop, const Field &src, std::vector bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike } } + ShiftTimer.Stop(); for(int s=0;s &Linop, const Field &src, std::vector // Before: 3 x npole + 3 x npole // After : 2 x npole (ps[s]) => 3x speed up of multishift CG. + AXPYTimer.Start(); if( (!converged[s]) ) { axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); } + AXPYTimer.Stop(); } // Convergence checks @@ -258,6 +281,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector if ( all_converged ){ + SolverTimer.Stop(); + + std::cout< &Linop, const Field &src, std::vector std::cout< Date: Thu, 3 May 2018 12:31:36 +0100 Subject: [PATCH 19/26] 5D free propagator for DWF and boundary conditions for free propagators --- lib/qcd/action/fermion/DomainWallFermion.h | 55 ++++- lib/qcd/action/fermion/FermionOperator.h | 30 ++- .../fermion/OverlapWilsonCayleyTanhFermion.h | 4 +- lib/qcd/action/fermion/WilsonFermion.cc | 3 +- lib/qcd/action/fermion/WilsonFermion.h | 2 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 227 +++++++++++++++++- lib/qcd/action/fermion/WilsonFermion5D.h | 5 +- tests/core/Test_fft.cc | 3 +- 8 files changed, 311 insertions(+), 18 deletions(-) diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/lib/qcd/action/fermion/DomainWallFermion.h index 72ce8f42..99f64865 100644 --- a/lib/qcd/action/fermion/DomainWallFermion.h +++ b/lib/qcd/action/fermion/DomainWallFermion.h @@ -8,6 +8,7 @@ Author: Peter Boyle Author: Peter Boyle +Author: Vera Guelpers This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -42,8 +43,58 @@ namespace Grid { INHERIT_IMPL_TYPES(Impl); public: - void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { - this->MomentumSpacePropagatorHt(out,in,_m); + void FreePropagator(const FermionField &in,FermionField &out,RealD mass, std::vector twist, bool fiveD) { + FermionField in_k(in._grid); + FermionField prop_k(in._grid); + + FFT theFFT((GridCartesian *) in._grid); + + //phase for boundary condition + ComplexField coor(in._grid); + ComplexField ph(in._grid); ph = zero; + FermionField in_buf(in._grid); in_buf = zero; + Complex ci(0.0,1.0); + assert(twist.size() == Nd);//check that twist is Nd + int shift = 0; + if(fiveD) shift = 1; + for(unsigned int nu = 0; nu < Nd; nu++) + { + // Shift coordinate lattice index by 1 to account for 5th dimension. + LatticeCoordinate(coor, nu + shift); + ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu+shift]))); + } + in_buf = exp((RealD)(2.0*M_PI)*ci*ph*(-1.0))*in; + + if(fiveD){//FFT only on temporal and spatial dimensions + std::vector mask(Nd+1,1); mask[0] = 0; + theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward); + this->MomentumSpacePropagatorHt_5d(prop_k,in_k,mass,twist); + theFFT.FFT_dim_mask(out,prop_k,mask,FFT::backward); + } + else{ + theFFT.FFT_all_dim(in_k,in,FFT::forward); + this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist); + theFFT.FFT_all_dim(out,prop_k,FFT::backward); + } + + //phase for boundary condition + out = out * exp((RealD)(2.0*M_PI)*ci*ph); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector twist) { + bool fiveD = true; //5d propagator by default + FreePropagator(in,out,mass,twist,fiveD); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + FreePropagator(in,out,mass,twist,fiveD); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + bool fiveD = true; //5d propagator by default + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + FreePropagator(in,out,mass,twist,fiveD); }; virtual void Instantiatable(void) {}; diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h index f81c9e2c..1ef99b85 100644 --- a/lib/qcd/action/fermion/FermionOperator.h +++ b/lib/qcd/action/fermion/FermionOperator.h @@ -9,6 +9,7 @@ Author: Peter Boyle Author: Peter Boyle Author: Peter Boyle +Author: Vera Guelpers This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -95,17 +96,38 @@ namespace Grid { virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac - virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { assert(0);}; + virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; - virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector twist) { FFT theFFT((GridCartesian *) in._grid); FermionField in_k(in._grid); FermionField prop_k(in._grid); - theFFT.FFT_all_dim(in_k,in,FFT::forward); - this->MomentumSpacePropagator(prop_k,in_k,mass); + //phase for boundary condition + ComplexField coor(in._grid); + ComplexField ph(in._grid); ph = zero; + FermionField in_buf(in._grid); in_buf = zero; + Complex ci(0.0,1.0); + assert(twist.size() == Nd);//check that twist is Nd + for(unsigned int nu = 0; nu < Nd; nu++) + { + LatticeCoordinate(coor, nu); + ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu]))); + } + in_buf = exp((RealD)(2.0*M_PI)*ci*ph*(-1.0))*in; + + theFFT.FFT_all_dim(in_k,in_buf,FFT::forward); + this->MomentumSpacePropagator(prop_k,in_k,mass,twist); theFFT.FFT_all_dim(out,prop_k,FFT::backward); + + //phase for boundary condition + out = out * exp((RealD)(2.0*M_PI)*ci*ph); + + }; + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + FreePropagator(in,out,mass,twist); }; /////////////////////////////////////////////// diff --git a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h index f516c5d0..fd7d74df 100644 --- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h +++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h @@ -42,8 +42,8 @@ namespace Grid { INHERIT_IMPL_TYPES(Impl); public: - void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { - this->MomentumSpacePropagatorHw(out,in,_m); + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); }; // Constructors diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 2d9cf22d..53e81c39 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -162,7 +162,7 @@ void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) MooeeInv(in,out); } template -void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m) +void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector twist) { typedef typename FermionField::vector_type vector_type; typedef typename FermionField::scalar_type ScalComplex; @@ -195,6 +195,7 @@ void WilsonFermion::MomentumSpacePropagator(FermionField &out, const Fermi RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; kmu = TwoPiL * kmu; + kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index ea25ed7f..3985771c 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -96,7 +96,7 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { virtual void MooeeInv(const FermionField &in, FermionField &out); virtual void MooeeInvDag(const FermionField &in, FermionField &out); - virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass) ; + virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector twist) ; //////////////////////// // Derivative interface diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 6f82aad2..6d4c6967 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -13,6 +13,7 @@ Author: Peter Boyle Author: paboyle Author: Guido Cossu Author: Andrew Lawson +Author: Vera Guelpers This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -563,7 +564,221 @@ void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag } template -void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const FermionField &in, RealD mass) +void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector twist) +{ + // what type LatticeComplex + GridBase *_grid = _FourDimGrid; + GridBase *_5dgrid = _FiveDimGrid; + + conformable(_5dgrid,out._grid); + + FermionField PRsource(_5dgrid); + FermionField PLsource(_5dgrid); + FermionField buf1_4d(_grid); + FermionField buf2_4d(_grid); + FermionField GR(_5dgrid); + FermionField GL(_5dgrid); + FermionField bufL_4d(_grid); + FermionField bufR_4d(_grid); + + unsigned int Ls = in._grid->_rdimensions[0]; + + typedef typename FermionField::vector_type vector_type; + typedef typename FermionField::scalar_type ScalComplex; + typedef iSinglet Tcomplex; + typedef Lattice > LatComplex; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + Gamma g5(Gamma::Algebra::Gamma5); + + std::vector latt_size = _grid->_fdimensions; + + LatComplex sk(_grid); sk = zero; + LatComplex sk2(_grid); sk2= zero; + LatComplex W(_grid); W= zero; + LatComplex a(_grid); a= zero; + LatComplex one (_grid); one = ScalComplex(1.0,0.0); + LatComplex cosha(_grid); + LatComplex kmu(_grid); + LatComplex Wea(_grid); + LatComplex Wema(_grid); + LatComplex sinha(_grid); + LatComplex sinhaLs(_grid); + LatComplex coshaLs(_grid); + LatComplex A(_grid); + LatComplex F(_grid); + LatComplex App(_grid); + LatComplex Amm(_grid); + LatComplex Bpp(_grid); + LatComplex Bmm(_grid); + LatComplex ABpm(_grid); //Apm=Amp=Bpm=Bmp + LatComplex signW(_grid); + + ScalComplex ci(0.0,1.0); + + for(int mu=0;mu alpha + //////////////////////////////////////////// + cosha = (one + W*W + sk) / (abs(W)*2.0); + + // FIXME Need a Lattice acosh + for(int idx=0;idx<_grid->lSites();idx++){ + std::vector lcoor(Nd); + Tcomplex cc; + RealD sgn; + _grid->LocalIndexToLocalCoor(idx,lcoor); + peekLocalSite(cc,cosha,lcoor); + assert((double)real(cc)>=1.0); + assert(fabs((double)imag(cc))<=1.0e-15); + cc = ScalComplex(::acosh(real(cc)),0.0); + pokeLocalSite(cc,a,lcoor); + } + + Wea = ( exp( a) * abs(W) ); + Wema= ( exp(-a) * abs(W) ); + sinha = 0.5*(exp( a) - exp(-a)); + sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls)); + coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls)); + + A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0); + F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass); + F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass); + F = F - abs(W) * sinha * 4.0 * mass; + + Bpp = (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one); + Bmm = (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one); + App = (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one); + Amm = (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one); + ABpm = (A/F) * abs(W) * sinha * 2.0 * (one + mass * coshaLs * 2.0 + mass*mass * one); + + //P+ source, P- source + PRsource = (in + g5 * in) * 0.5; + PLsource = (in - g5 * in) * 0.5; + + //calculate GR, GL + for(unsigned int ss=1;ss<=Ls;ss++) + { + bufR_4d = zero; + bufL_4d = zero; + for(unsigned int tt=1;tt<=Ls;tt++) + { + //possible sign if W<0 + if((ss+tt)%2==1) signW = abs(W)/W; + else signW = one; + + unsigned int f = (ss > tt) ? ss-tt : tt-ss; //f = abs(ss-tt) + //GR + buf1_4d = zero; + ExtractSlice(buf1_4d, PRsource, (tt-1), 0); + //G(s,t) + bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d; + //A++*exp(a(s+t)) + bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ; + //A+-*exp(a(s-t)) + bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ; + //A-+*exp(a(-s+t)) + bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ; + //A--*exp(a(-s-t)) + bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ; + + //GL + buf2_4d = zero; + ExtractSlice(buf2_4d, PLsource, (tt-1), 0); + //G(s,t) + bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d; + //B++*exp(a(s+t)) + bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ; + //B+-*exp(a(s-t)) + bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ; + //B-+*exp(a(-s+t)) + bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ; + //B--*exp(a(-s-t)) + bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ; + } + InsertSlice(bufR_4d, GR, (ss-1), 0); + InsertSlice(bufL_4d, GL, (ss-1), 0); + } + +//calculate propagator + for(unsigned int ss=1;ss<=Ls;ss++) + { + bufR_4d = zero; + bufL_4d = zero; + + //(i*gamma_mu*sin(p_mu) - W)*(GL*P- source) + buf1_4d = zero; + ExtractSlice(buf1_4d, GL, (ss-1), 0); + buf2_4d = zero; + for(int mu=0;mu +void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const FermionField &in, RealD mass,std::vector twist) { // what type LatticeComplex GridBase *_grid = _FourDimGrid; @@ -606,6 +821,7 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; kmu = TwoPiL * kmu; + kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); sk = sk + sin(kmu) *sin(kmu); @@ -619,7 +835,7 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe //////////////////////////////////////////// // Cosh alpha -> alpha //////////////////////////////////////////// - cosha = (one + W*W + sk) / (W*2.0); + cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh for(int idx=0;idx<_grid->lSites();idx++){ @@ -634,8 +850,8 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe pokeLocalSite(cc,a,lcoor); } - Wea = ( exp( a) * W ); - Wema= ( exp(-a) * W ); + Wea = ( exp( a) * abs(W) ); + Wema= ( exp(-a) * abs(W) ); num = num + ( one - Wema ) * mass * in; denom= ( Wea - one ) + mass*mass * (one - Wema); @@ -643,7 +859,7 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe } template -void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass) +void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) { Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, @@ -683,6 +899,7 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; kmu = TwoPiL * kmu; + kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); sk = sk + sin(kmu)*sin(kmu); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 21da4c31..d22b5d6f 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -118,8 +118,9 @@ namespace QCD { virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass) ; - void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass) ; + void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; // Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index b2336cfa..b7f9f000 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -309,7 +309,8 @@ int main (int argc, char ** argv) // Momentum space prop std::cout << " Solving by FFT and Feynman rules" < Date: Thu, 3 May 2018 12:33:20 +0100 Subject: [PATCH 20/26] FreeProp module for Hadrons --- extras/Hadrons/Modules.hpp | 1 + extras/Hadrons/Modules/MFermion/FreeProp.cc | 36 +++ extras/Hadrons/Modules/MFermion/FreeProp.hpp | 189 ++++++++++++++ extras/Hadrons/modules.inc | 2 + tests/hadrons/Test_free_prop.cc | 245 +++++++++++++++++++ 5 files changed, 473 insertions(+) create mode 100644 extras/Hadrons/Modules/MFermion/FreeProp.cc create mode 100644 extras/Hadrons/Modules/MFermion/FreeProp.hpp create mode 100644 tests/hadrons/Test_free_prop.cc diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp index fc536393..2c356cfc 100644 --- a/extras/Hadrons/Modules.hpp +++ b/extras/Hadrons/Modules.hpp @@ -37,6 +37,7 @@ See the full license in the file "LICENSE" in the top level distribution directo #include #include #include +#include #include #include #include diff --git a/extras/Hadrons/Modules/MFermion/FreeProp.cc b/extras/Hadrons/Modules/MFermion/FreeProp.cc new file mode 100644 index 00000000..b8eb0529 --- /dev/null +++ b/extras/Hadrons/Modules/MFermion/FreeProp.cc @@ -0,0 +1,36 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: extras/Hadrons/Modules/MFermion/FreeProp.cc + +Copyright (C) 2015-2018 + +Author: Antonin Portelli +Author: Vera Guelpers + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; +using namespace Hadrons; +using namespace MFermion; + +template class Grid::Hadrons::MFermion::TFreeProp; + diff --git a/extras/Hadrons/Modules/MFermion/FreeProp.hpp b/extras/Hadrons/Modules/MFermion/FreeProp.hpp new file mode 100644 index 00000000..b1038ddd --- /dev/null +++ b/extras/Hadrons/Modules/MFermion/FreeProp.hpp @@ -0,0 +1,189 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: extras/Hadrons/Modules/MFermion/FreeProp.hpp + +Copyright (C) 2015-2018 + +Author: Vera Guelpers + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + + +#ifndef Hadrons_MFermion_FreeProp_hpp_ +#define Hadrons_MFermion_FreeProp_hpp_ + +#include +#include +#include + +#include + +BEGIN_HADRONS_NAMESPACE + +/****************************************************************************** + * FreeProp * + ******************************************************************************/ +BEGIN_MODULE_NAMESPACE(MFermion) + +class FreePropPar: Serializable +{ +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar, + std::string, source, + std::string, action, + double, mass, + std::string, twist); +}; + +template +class TFreeProp: public Module +{ +public: + FGS_TYPE_ALIASES(FImpl,); +public: + // constructor + TFreeProp(const std::string name); + // destructor + virtual ~TFreeProp(void) {}; + // dependency relation + virtual std::vector getInput(void); + virtual std::vector getOutput(void); +protected: + // setup + virtual void setup(void); + // execution + virtual void execute(void); +private: + unsigned int Ls_; +}; + +MODULE_REGISTER_TMP(FreeProp, TFreeProp, MFermion); + +/****************************************************************************** + * TFreeProp implementation * + ******************************************************************************/ +// constructor ///////////////////////////////////////////////////////////////// +template +TFreeProp::TFreeProp(const std::string name) +: Module(name) +{} + +// dependencies/products /////////////////////////////////////////////////////// +template +std::vector TFreeProp::getInput(void) +{ + std::vector in = {par().source, par().action}; + + return in; +} + +template +std::vector TFreeProp::getOutput(void) +{ + std::vector out = {getName(), getName() + "_5d"}; + + return out; +} + +// setup /////////////////////////////////////////////////////////////////////// +template +void TFreeProp::setup(void) +{ + Ls_ = env().getObjectLs(par().action); + envCreateLat(PropagatorField, getName()); + envTmpLat(FermionField, "source", Ls_); + envTmpLat(FermionField, "sol", Ls_); + envTmpLat(FermionField, "tmp"); + if (Ls_ > 1) + { + envCreateLat(PropagatorField, getName() + "_5d", Ls_); + } +} + +// execution /////////////////////////////////////////////////////////////////// +template +void TFreeProp::execute(void) +{ + LOG(Message) << "Computing free fermion propagator '" << getName() << "'" + << std::endl; + + std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d"); + auto &prop = envGet(PropagatorField, propName); + auto &fullSrc = envGet(PropagatorField, par().source); + auto &mat = envGet(FMat, par().action); + RealD mass = par().mass; + + envGetTmp(FermionField, source); + envGetTmp(FermionField, sol); + envGetTmp(FermionField, tmp); + LOG(Message) << "Calculating a free Propagator with mass " << mass + << " using the action '" << par().action + << "' on source '" << par().source << "'" << std::endl; + for (unsigned int s = 0; s < Ns; ++s) + for (unsigned int c = 0; c < FImpl::Dimension; ++c) + { + LOG(Message) << "Calculation for spin= " << s << ", color= " << c + << std::endl; + // source conversion for 4D sources + if (!env().isObject5d(par().source)) + { + if (Ls_ == 1) + { + PropToFerm(source, fullSrc, s, c); + } + else + { + PropToFerm(tmp, fullSrc, s, c); + make_5D(tmp, source, Ls_); + } + } + // source conversion for 5D sources + else + { + if (Ls_ != env().getObjectLs(par().source)) + { + HADRONS_ERROR(Size, "Ls mismatch between quark action and source"); + } + else + { + PropToFerm(source, fullSrc, s, c); + } + } + sol = zero; + std::vector twist = strToVec(par().twist); + if(twist.size() != Nd) HADRONS_ERROR(Size, "number of twist angles does not match number of dimensions"); + mat.FreePropagator(source,sol,mass,twist); + FermToProp(prop, sol, s, c); + // create 4D propagators from 5D one if necessary + if (Ls_ > 1) + { + PropagatorField &p4d = envGet(PropagatorField, getName()); + make_4D(sol, tmp, Ls_); + FermToProp(p4d, tmp, s, c); + } + } +} + +END_MODULE_NAMESPACE + +END_HADRONS_NAMESPACE + +#endif // Hadrons_MFermion_FreeProp_hpp_ diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc index ad3a8727..69463f9d 100644 --- a/extras/Hadrons/modules.inc +++ b/extras/Hadrons/modules.inc @@ -7,6 +7,7 @@ modules_cc =\ Modules/MContraction/WardIdentity.cc \ Modules/MContraction/DiscLoop.cc \ Modules/MContraction/Gamma3pt.cc \ + Modules/MFermion/FreeProp.cc \ Modules/MFermion/GaugeProp.cc \ Modules/MSource/Point.cc \ Modules/MSource/Wall.cc \ @@ -54,6 +55,7 @@ modules_hpp =\ Modules/MContraction/Gamma3pt.hpp \ Modules/MContraction/WardIdentity.hpp \ Modules/MContraction/WeakHamiltonianEye.hpp \ + Modules/MFermion/FreeProp.hpp \ Modules/MFermion/GaugeProp.hpp \ Modules/MSource/SeqGamma.hpp \ Modules/MSource/Point.hpp \ diff --git a/tests/hadrons/Test_free_prop.cc b/tests/hadrons/Test_free_prop.cc new file mode 100644 index 00000000..a1a5aadd --- /dev/null +++ b/tests/hadrons/Test_free_prop.cc @@ -0,0 +1,245 @@ +/******************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + + Source file: tests/hadrons/Test_free_prop.cc + + Copyright (C) 2015-2018 + + Author: Antonin Portelli + Author: Vera Guelpers + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution + directory. + *******************************************************************************/ + +#include +#include + +using namespace Grid; +using namespace Hadrons; + +int main(int argc, char *argv[]) +{ + // initialization ////////////////////////////////////////////////////////// + Grid_init(&argc, &argv); + HadronsLogError.Active(GridLogError.isActive()); + HadronsLogWarning.Active(GridLogWarning.isActive()); + HadronsLogMessage.Active(GridLogMessage.isActive()); + HadronsLogIterative.Active(GridLogIterative.isActive()); + HadronsLogDebug.Active(GridLogDebug.isActive()); + LOG(Message) << "Grid initialized" << std::endl; + + + // run setup /////////////////////////////////////////////////////////////// + Application application; + std::vector flavour = {"h"}; //{"l", "s", "c1", "c2", "c3"}; + std::vector mass = {.2}; //{.01, .04, .2 , .25 , .3 }; + std::vector lepton_flavour = {"mu"}; + std::vector lepton_mass = {.2}; + + unsigned int nt = GridDefaultLatt()[Tp]; + + // global parameters + Application::GlobalPar globalPar; + globalPar.trajCounter.start = 1500; + globalPar.trajCounter.end = 1520; + globalPar.trajCounter.step = 20; + globalPar.seed = "1 2 3 4"; + application.setPar(globalPar); + // gauge field + application.createModule("gauge"); + // unit gauge field for lepton + application.createModule("free_gauge"); + // pt source + MSource::Point::Par ptPar; + ptPar.position = "0 0 0 0"; + application.createModule("pt", ptPar); + // sink + MSink::Point::Par sinkPar; + sinkPar.mom = "0 0 0"; + application.createModule("sink", sinkPar); + + // set fermion boundary conditions to be periodic space, antiperiodic time. + std::string boundary = "1 1 1 -1"; + + + //Propagators from FFT and Feynman rules + for (unsigned int i = 0; i < lepton_mass.size(); ++i) + { + //DWF actions + MAction::DWF::Par actionPar_lep; + actionPar_lep.gauge = "free_gauge"; + actionPar_lep.Ls = 8; + actionPar_lep.M5 = 1.8; + actionPar_lep.mass = lepton_mass[i]; + actionPar_lep.boundary = boundary; + application.createModule("free_DWF_" + lepton_flavour[i], actionPar_lep); + + //DWF free propagators + MFermion::FreeProp::Par freePar; + freePar.source = "pt"; + freePar.action = "free_DWF_" + lepton_flavour[i]; + freePar.twist = "0 0 0 0.5"; + freePar.mass = lepton_mass[i]; + application.createModule("Lpt_" + lepton_flavour[i], + freePar); + + //Wilson actions + MAction::Wilson::Par actionPar_lep_W; + actionPar_lep_W.gauge = "free_gauge"; + actionPar_lep_W.mass = lepton_mass[i]; + actionPar_lep_W.boundary = boundary; + application.createModule("free_W_" + lepton_flavour[i], actionPar_lep_W); + + //Wilson free propagators + MFermion::FreeProp::Par freePar_W; + freePar_W.source = "pt"; + freePar_W.action = "free_W_" + lepton_flavour[i]; + freePar_W.twist = "0 0 0 0.5"; + freePar_W.mass = lepton_mass[i]; + application.createModule("W_Lpt_" + lepton_flavour[i], + freePar_W); + + + } + + + + //Propagators from inversion + for (unsigned int i = 0; i < flavour.size(); ++i) + { + //DWF actions + MAction::DWF::Par actionPar; + actionPar.gauge = "gauge"; + actionPar.Ls = 8; + actionPar.M5 = 1.8; + actionPar.mass = mass[i]; + actionPar.boundary = boundary; + application.createModule("DWF_" + flavour[i], actionPar); + + // solvers + MSolver::RBPrecCG::Par solverPar; + solverPar.action = "DWF_" + flavour[i]; + solverPar.residual = 1.0e-8; + solverPar.maxIteration = 10000; + application.createModule("CG_" + flavour[i], + solverPar); + + //DWF propagators + MFermion::GaugeProp::Par quarkPar; + quarkPar.solver = "CG_" + flavour[i]; + quarkPar.source = "pt"; + application.createModule("Qpt_" + flavour[i], + quarkPar); + + + + //Wilson actions + MAction::Wilson::Par actionPar_W; + actionPar_W.gauge = "gauge"; + actionPar_W.mass = mass[i]; + actionPar_W.boundary = boundary; + application.createModule("W_" + flavour[i], actionPar_W); + + + // solvers + MSolver::RBPrecCG::Par solverPar_W; + solverPar_W.action = "W_" + flavour[i]; + solverPar_W.residual = 1.0e-8; + solverPar_W.maxIteration = 10000; + application.createModule("W_CG_" + flavour[i], + solverPar_W); + + //Wilson propagators + MFermion::GaugeProp::Par quarkPar_W; + quarkPar_W.solver = "W_CG_" + flavour[i]; + quarkPar_W.source = "pt"; + application.createModule("W_Qpt_" + flavour[i], + quarkPar_W); + + } + + + //2pt contraction for Propagators from FFT and Feynman rules + for (unsigned int i = 0; i < lepton_flavour.size(); ++i) + for (unsigned int j = i; j < lepton_flavour.size(); ++j) + { + //2pt function contraction DWF + MContraction::Meson::Par freemesPar; + freemesPar.output = "2pt_free/DWF_L_pt_" + lepton_flavour[i] + lepton_flavour[j]; + freemesPar.q1 = "Lpt_" + lepton_flavour[i]; + freemesPar.q2 = "Lpt_" + lepton_flavour[j]; + freemesPar.gammas = "(Gamma5 Gamma5)"; + freemesPar.sink = "sink"; + application.createModule("meson_L_pt_" + + lepton_flavour[i] + lepton_flavour[j], + freemesPar); + + //2pt function contraction Wilson + MContraction::Meson::Par freemesPar_W; + freemesPar_W.output = "2pt_free/W_L_pt_" + lepton_flavour[i] + lepton_flavour[j]; + freemesPar_W.q1 = "W_Lpt_" + lepton_flavour[i]; + freemesPar_W.q2 = "W_Lpt_" + lepton_flavour[j]; + freemesPar_W.gammas = "(Gamma5 Gamma5)"; + freemesPar_W.sink = "sink"; + application.createModule("W_meson_L_pt_" + + lepton_flavour[i] + lepton_flavour[j], + freemesPar_W); + + } + + //2pt contraction for Propagators from inverion + for (unsigned int i = 0; i < flavour.size(); ++i) + for (unsigned int j = i; j < flavour.size(); ++j) + { + //2pt function contraction DWF + MContraction::Meson::Par mesPar; + mesPar.output = "2pt_free/DWF_pt_" + flavour[i] + flavour[j]; + mesPar.q1 = "Qpt_" + flavour[i]; + mesPar.q2 = "Qpt_" + flavour[j]; + mesPar.gammas = "(Gamma5 Gamma5)"; + mesPar.sink = "sink"; + application.createModule("meson_pt_" + + flavour[i] + flavour[j], + mesPar); + + + //2pt function contraction Wilson + MContraction::Meson::Par mesPar_W; + mesPar_W.output = "2pt_free/W_pt_" + flavour[i] + flavour[j]; + mesPar_W.q1 = "W_Qpt_" + flavour[i]; + mesPar_W.q2 = "W_Qpt_" + flavour[j]; + mesPar_W.gammas = "(Gamma5 Gamma5)"; + mesPar_W.sink = "sink"; + application.createModule("W_meson_pt_" + + flavour[i] + flavour[j], + mesPar_W); + + } + + + + // execution + application.saveParameterFile("free_prop.xml"); + application.run(); + + // epilogue + LOG(Message) << "Grid is finalizing now" << std::endl; + Grid_finalize(); + + return EXIT_SUCCESS; +} From 9d9692d4398e78374948af4b939c9b06ba6abc6c Mon Sep 17 00:00:00 2001 From: Vera Guelpers Date: Thu, 3 May 2018 16:40:16 +0100 Subject: [PATCH 21/26] Fix double vs float in boundary phases --- lib/qcd/action/fermion/DomainWallFermion.h | 4 ++-- lib/qcd/action/fermion/FermionOperator.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/lib/qcd/action/fermion/DomainWallFermion.h index 99f64865..56179f26 100644 --- a/lib/qcd/action/fermion/DomainWallFermion.h +++ b/lib/qcd/action/fermion/DomainWallFermion.h @@ -63,7 +63,7 @@ namespace Grid { LatticeCoordinate(coor, nu + shift); ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu+shift]))); } - in_buf = exp((RealD)(2.0*M_PI)*ci*ph*(-1.0))*in; + in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in; if(fiveD){//FFT only on temporal and spatial dimensions std::vector mask(Nd+1,1); mask[0] = 0; @@ -78,7 +78,7 @@ namespace Grid { } //phase for boundary condition - out = out * exp((RealD)(2.0*M_PI)*ci*ph); + out = out * exp((Real)(2.0*M_PI)*ci*ph); }; virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector twist) { diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h index 1ef99b85..f39e7511 100644 --- a/lib/qcd/action/fermion/FermionOperator.h +++ b/lib/qcd/action/fermion/FermionOperator.h @@ -115,14 +115,14 @@ namespace Grid { LatticeCoordinate(coor, nu); ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu]))); } - in_buf = exp((RealD)(2.0*M_PI)*ci*ph*(-1.0))*in; + in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in; theFFT.FFT_all_dim(in_k,in_buf,FFT::forward); this->MomentumSpacePropagator(prop_k,in_k,mass,twist); theFFT.FFT_all_dim(out,prop_k,FFT::backward); //phase for boundary condition - out = out * exp((RealD)(2.0*M_PI)*ci*ph); + out = out * exp((Real)(2.0*M_PI)*ci*ph); }; virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { From 9ada378e38333f36039182cce8f795146c0e8302 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Fri, 4 May 2018 10:58:01 +0100 Subject: [PATCH 22/26] Add timing --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index 28a80ce6..9d2719fc 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -207,7 +207,10 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector cp=c; MatrixTimer.Start(); - Linop.HermOpAndNorm(p,mmp,d,qq); + //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used + Linop.HermOp(p,mmp); + d=real(innerProduct(p,mmp)); + MatrixTimer.Stop(); AXPYTimer.Start(); @@ -253,11 +256,9 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector // Before: 3 x npole + 3 x npole // After : 2 x npole (ps[s]) => 3x speed up of multishift CG. - AXPYTimer.Start(); if( (!converged[s]) ) { axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); } - AXPYTimer.Stop(); } // Convergence checks From 68c028b0a6039fe9504ab2c77ab4635e5241dc34 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 21 May 2018 12:54:25 +0100 Subject: [PATCH 23/26] Comment --- lib/algorithms/iterative/ConjugateGradientMultiShift.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h index 9d2719fc..4c311bc1 100644 --- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h @@ -208,6 +208,7 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector cp=c; MatrixTimer.Start(); //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used + // The below is faster on KNL Linop.HermOp(p,mmp); d=real(innerProduct(p,mmp)); From 0e127b1fc788c0d343868d99714f8d286b451801 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Mon, 21 May 2018 12:57:13 +0100 Subject: [PATCH 24/26] New file single prec test --- tests/core/Test_staggered5DvecF.cc | 196 +++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 tests/core/Test_staggered5DvecF.cc diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc new file mode 100644 index 00000000..5d421673 --- /dev/null +++ b/tests/core/Test_staggered5DvecF.cc @@ -0,0 +1,196 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_wilson.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + const int Ls=16; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds({1,2,3,4}); + + GridParallelRNG pRNG4(UGrid); + GridParallelRNG pRNG5(FGrid); + pRNG4.SeedFixedIntegers(seeds); + pRNG5.SeedFixedIntegers(seeds); + + typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DF::ImplParams params; + + FermionField src (FGrid); + random(pRNG5,src); + /* + std::vector site({0,1,2,0,0}); + ColourVector cv = zero; + cv()()(0)=1.0; + src = zero; + pokeSite(cv,src,site); + */ + FermionField result(FGrid); result=zero; + FermionField tmp(FGrid); tmp=zero; + FermionField err(FGrid); tmp=zero; + FermionField phi (FGrid); random(pRNG5,phi); + FermionField chi (FGrid); random(pRNG5,chi); + + LatticeGaugeFieldF Umu(UGrid); + SU3::HotConfiguration(pRNG4,Umu); + + /* + for(int mu=1;mu<4;mu++){ + auto tmp = PeekIndex(Umu,mu); + tmp = zero; + PokeIndex(Umu,tmp,mu); + } + */ + double volume=Ls; + for(int mu=0;mu Date: Mon, 28 May 2018 11:39:17 +0200 Subject: [PATCH 25/26] Hadrons: env function to get volume in double --- extras/Hadrons/Environment.cc | 11 +++++------ extras/Hadrons/Environment.hpp | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/extras/Hadrons/Environment.cc b/extras/Hadrons/Environment.cc index 990a717e..bb12a036 100644 --- a/extras/Hadrons/Environment.cc +++ b/extras/Hadrons/Environment.cc @@ -49,11 +49,10 @@ Environment::Environment(void) dim_, GridDefaultSimd(nd_, vComplex::Nsimd()), GridDefaultMpi())); gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get())); - auto loc = getGrid()->LocalDimensions(); - locVol_ = 1; - for (unsigned int d = 0; d < loc.size(); ++d) + vol_ = 1.; + for (auto d: dim_) { - locVol_ *= loc[d]; + vol_ *= d; } rng4d_.reset(new GridParallelRNG(grid4d_.get())); } @@ -190,9 +189,9 @@ int Environment::getDim(const unsigned int mu) const return dim_[mu]; } -unsigned long int Environment::getLocalVolume(void) const +double Environment::getVolume(void) const { - return locVol_; + return vol_; } // random number generator ///////////////////////////////////////////////////// diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp index a9c3c724..13a7dfe6 100644 --- a/extras/Hadrons/Environment.hpp +++ b/extras/Hadrons/Environment.hpp @@ -94,8 +94,8 @@ public: const unsigned int Ls = 1) const; std::vector getDim(void) const; int getDim(const unsigned int mu) const; - unsigned long int getLocalVolume(void) const; unsigned int getNd(void) const; + double getVolume(void) const; // random number generator void setSeed(const std::vector &seed); GridParallelRNG * get4dRng(void) const; @@ -155,7 +155,7 @@ public: void printContent(void) const; private: // general - unsigned long int locVol_; + double vol_; bool protect_{true}; // grids std::vector dim_; From 84685c9bc3d8cfb32e6536132062eb29603966b9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 4 Jun 2018 13:42:07 +0100 Subject: [PATCH 26/26] Overflow fix --- lib/lattice/Lattice_rng.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h index edf9dd23..654c5dd5 100644 --- a/lib/lattice/Lattice_rng.h +++ b/lib/lattice/Lattice_rng.h @@ -158,10 +158,19 @@ namespace Grid { // tens of seconds per trajectory so this is clean in all reasonable cases, // and margin of safety is orders of magnitude. // We could hack Sitmo to skip in the higher order words of state if necessary + // + // Replace with 2^30 ; avoid problem on large volumes + // ///////////////////////////////////////////////////////////////////////////////////// // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init + const int shift = 30; + uint64_t skip = site; - skip = skip<<40; + + skip = skip<> shift)==site); // check for overflow + eng.discard(skip); // std::cout << " Engine " <