From 006cc8a8f151f36e73f671e98769a1f9bd9374e2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 May 2020 08:33:06 -0400 Subject: [PATCH] Staggereed move to accelerator --- Grid/qcd/action/fermion/Fermion.h | 5 + .../fermion/ImprovedStaggeredFermion5D.h | 4 +- .../action/fermion/NaiveStaggeredFermion.h | 194 +++++++ Grid/qcd/action/fermion/StaggeredKernels.h | 53 +- ...ImprovedStaggeredFermion5DImplementation.h | 126 +---- .../ImprovedStaggeredFermionImplementation.h | 102 +--- .../NaiveStaggeredFermionImplementation.h | 499 ++++++++++++++++++ .../implementation/StaggeredKernelsAsm.h | 43 +- .../implementation/StaggeredKernelsHand.h | 42 +- .../StaggeredKernelsImplementation.h | 222 +++++--- .../WilsonKernelsImplementation.h | 4 + .../NaiveStaggeredFermionInstantiation.cc | 36 ++ ...iveStaggeredFermionInstantiation.cc.master | 37 ++ ...geredFermionInstantiationStaggeredImplD.cc | 1 + ...geredFermionInstantiationStaggeredImplF.cc | 1 + .../instantiation/generate_instantiations.sh | 1 + benchmarks/Benchmark_staggered.cc | 23 - tests/core/Test_staggered_naive.cc | 282 ++++++++++ 18 files changed, 1319 insertions(+), 356 deletions(-) create mode 100644 Grid/qcd/action/fermion/NaiveStaggeredFermion.h create mode 100644 Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h create mode 100644 Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc create mode 100644 Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master create mode 120000 Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc create mode 100644 tests/core/Test_staggered_naive.cc diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index af5bebcc..16252340 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); +#include #include #include NAMESPACE_CHECK(Staggered); @@ -282,6 +283,10 @@ typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; +typedef NaiveStaggeredFermion NaiveStaggeredFermionR; +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 0ce1c701..2578c288 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -62,8 +62,8 @@ public: double DhopCalls; double DhopCommTime; double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h new file mode 100644 index 00000000..8715f3c2 --- /dev/null +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -0,0 +1,194 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_QCD_NAIVE_STAG_FERMION_H +#define GRID_QCD_NAIVE_STAG_FERMION_H + +NAMESPACE_BEGIN(Grid); + +class NaiveStaggeredFermionStatic { +public: + static const std::vector directions; + static const std::vector displacements; + static const int npoint = 8; +}; + +template +class NaiveStaggeredFermion : public StaggeredKernels, public NaiveStaggeredFermionStatic { +public: + INHERIT_IMPL_TYPES(Impl); + typedef StaggeredKernels Kernels; + + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _grid; } + GridBase *GaugeRedBlackGrid(void) { return _cbgrid; } + GridBase *FermionGrid(void) { return _grid; } + GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + + ////////////////////////////////////////////////////////////////// + // override multiply; cut number routines if pass dagger argument + // and also make interface more uniformly consistent + ////////////////////////////////////////////////////////////////// + RealD M(const FermionField &in, FermionField &out); + RealD Mdag(const FermionField &in, FermionField &out); + + ///////////////////////////////////////////////////////// + // half checkerboard operations + ///////////////////////////////////////////////////////// + void Meooe(const FermionField &in, FermionField &out); + void MeooeDag(const FermionField &in, FermionField &out); + void Mooee(const FermionField &in, FermionField &out); + void MooeeDag(const FermionField &in, FermionField &out); + void MooeeInv(const FermionField &in, FermionField &out); + void MooeeInvDag(const FermionField &in, FermionField &out); + + //////////////////////// + // Derivative interface + //////////////////////// + // Interface calls an internal routine + void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + + /////////////////////////////////////////////////////////////// + // non-hermitian hopping term; half cb or both + /////////////////////////////////////////////////////////////// + void Dhop (const FermionField &in, FermionField &out, int dag); + void DhopOE(const FermionField &in, FermionField &out, int dag); + void DhopEO(const FermionField &in, FermionField &out, int dag); + + /////////////////////////////////////////////////////////////// + // Multigrid assistance; force term uses too + /////////////////////////////////////////////////////////////// + void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); + void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); + + /////////////////////////////////////////////////////////////// + // Extra methods added by derived + /////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl &st, + DoubledGaugeField &U, + GaugeField &mat, + const FermionField &A, const FermionField &B, int dag); + + void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// + NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + NaiveStaggeredFermion(GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + + // DoubleStore impl dependent + void ImportGauge (const GaugeField &_U ); + DoubledGaugeField &GetU(void) { return Umu ; } ; + void CopyGaugeCheckerboards(void); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + + // protected: +public: + // any other parameters of action ??? + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } + RealD mass; + RealD u0; + RealD c1; + + GridBase *_grid; + GridBase *_cbgrid; + + // Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + /////////////////////////////////////////////////////////////// + // Conserved current utilities + /////////////////////////////////////////////////////////////// + void ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu); + void SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &srct, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx); +}; + +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 6ef0ab9d..0203dca2 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -47,23 +47,34 @@ template class StaggeredKernels : public FermionOperator , pub INHERIT_IMPL_TYPES(Impl); typedef FermionOperator Base; -public: - - void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + public: + + void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + void DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + + void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, + int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + protected: /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -71,15 +82,18 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + template + void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -87,27 +101,10 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - /////////////////////////////////////////////////////////////////////////////////////////////////// - // Generic interface; fan out to right routine - /////////////////////////////////////////////////////////////////////////////////////////////////// - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior); public: diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 01d5578f..58d2b368 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } @@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; @@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DhopFaceTime-=usecond(); st.Prepare(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - double ctime=0; - double ptime=0; - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. + // Remove explicit thread mapping introduced for OPA reasons. ////////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + DhopComputeTime-=usecond(); { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - if (tid >= ncomms) { - double start = usecond(); - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = U.Grid()->oSites(); // 4d vol - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); - - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<--------- - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------ - } - } - ptime = usecond() - start; - } else { - double start = usecond(); - st.CommunicateThreaded(); - ctime = usecond() - start; - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopCommTime += ctime; - DhopComputeTime+=ptime; - - // First to enter, last to leave timing - st.CollateThreads(); + DhopComputeTime+=usecond(); DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - DhopComputeTime2-=usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<---------- - }); - } else { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<---------- - }); + DhopComputeTime2-=usecond(); + { + int interior=0; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2+=usecond(); -#else - assert(0); -#endif - } template @@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; - - //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); @@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); - if (dag == DaggerYes) { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v); - }); - } else { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v); - }); + { + int interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); - //double t2=usecond(); - //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; } /*CHANGE END*/ diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 1e59c4e7..64554100 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -395,11 +395,9 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder const FermionField &in, FermionField &out, int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } template @@ -409,7 +407,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st const FermionField &in, FermionField &out, int dag) { -#ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); @@ -418,60 +415,30 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopFaceTime -= usecond(); st.Prepare(); st.HaloGather(in,compressor); - st.CommsMergeSHM(compressor); DhopFaceTime += usecond(); + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. + // Removed explicit thread comms ////////////////////////////////////////////////////////////////////////////////////////////////////// DhopComputeTime -= usecond(); -#pragma omp parallel { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - - if (tid >= ncomms) { - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = len; - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } - } else { - st.CommunicateThreaded(); - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + // First to enter, last to leave timing DhopFaceTime -= usecond(); st.CommsMerge(compressor); @@ -479,28 +446,11 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopComputeTime2 -= usecond(); { - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } else { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } + int interior=0; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2 += usecond(); -#else - assert(0); -#endif } @@ -520,19 +470,11 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le st.HaloExchange(in, compressor); DhopCommTime += usecond(); - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); DhopComputeTime -= usecond(); - if (dag == DaggerYes) { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); - } else { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); + { + int interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h new file mode 100644 index 00000000..ccd36f57 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -0,0 +1,499 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#pragma once + +NAMESPACE_BEGIN(Grid); + +///////////////////////////////// +// Constructor and gauge import +///////////////////////////////// + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, + RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : Kernels(p), + _grid(&Fgrid), + _cbgrid(&Hgrid), + Stencil(&Fgrid, npoint, Even, directions, displacements,p), + StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even + StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd + mass(_mass), + Lebesgue(_grid), + LebesgueEvenOdd(_cbgrid), + Umu(&Fgrid), + UmuEven(&Hgrid), + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ + int vol4; + int LLs=1; + c1=_c1; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p) +{ + ImportGauge(_U); +} + +//////////////////////////////////////////////////////////// +// Momentum space propagator should be +// https://arxiv.org/pdf/hep-lat/9712010.pdf +// +// mom space action. +// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m +// +// must track through staggered flavour/spin reduction in literature to +// turn to free propagator for the one component chi field, a la page 4/5 +// of above link to implmement fourier based solver. +//////////////////////////////////////////////////////////// + +template +void NaiveStaggeredFermion::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); +} +template +void NaiveStaggeredFermion::ImportGauge(const GaugeField &_U) +{ + GaugeLinkField U(GaugeGrid()); + DoubledGaugeField _UUU(GaugeGrid()); + //////////////////////////////////////////////////////// + // Double Store should take two fields for Naik and one hop separately. + // Discard teh Naik as Naive + //////////////////////////////////////////////////////// + Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U ); + + //////////////////////////////////////////////////////// + // Apply scale factors to get the right fermion Kinetic term + // Could pass coeffs into the double store to save work. + // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) + //////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + U = PeekIndex(Umu, mu); + PokeIndex(Umu, U*( 0.5*c1/u0), mu ); + + U = PeekIndex(Umu, mu+4); + PokeIndex(Umu, U*(-0.5*c1/u0), mu+4); + + } + + CopyGaugeCheckerboards(); +} + +///////////////////////////// +// Implement the interface +///////////////////////////// + +template +RealD NaiveStaggeredFermion::M(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerNo); + return axpy_norm(out, mass, in, out); +} + +template +RealD NaiveStaggeredFermion::Mdag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerYes); + return axpy_norm(out, mass, in, out); +} + +template +void NaiveStaggeredFermion::Meooe(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } +} +template +void NaiveStaggeredFermion::MeooeDag(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } +} + +template +void NaiveStaggeredFermion::Mooee(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + typename FermionField::scalar_type scal(mass); + out = scal * in; +} + +template +void NaiveStaggeredFermion::MooeeDag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Mooee(in, out); +} + +template +void NaiveStaggeredFermion::MooeeInv(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + out = (1.0 / (mass)) * in; +} + +template +void NaiveStaggeredFermion::MooeeInvDag(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + MooeeInv(in, out); +} + +/////////////////////////////////// +// Internal +/////////////////////////////////// + +template +void NaiveStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, + GaugeField & mat, + const FermionField &A, const FermionField &B, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + Compressor compressor; + + FermionField Btilde(B.Grid()); + FermionField Atilde(B.Grid()); + Atilde = A; + + st.HaloExchange(B, compressor); + + for (int mu = 0; mu < Nd; mu++) { + + //////////////////////// + // Call the single hop + //////////////////////// + auto U_v = U.View(CpuRead); + auto B_v = B.View(CpuWrite); + auto Btilde_v = Btilde.View(CpuWrite); + thread_for(sss,B.Grid()->oSites(),{ + Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); + }); + + assert(0);// need to figure out the force interface with a blasted three link term. + + } +} + +template +void NaiveStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _grid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + mat.Checkerboard() = U.Checkerboard(); + + DerivInternal(Stencil, Umu, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Even); + assert(U.Checkerboard() == Odd); + mat.Checkerboard() = Odd; + + DerivInternal(StencilEven, UmuOdd, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Odd); + assert(U.Checkerboard() == Even); + mat.Checkerboard() = Even; + + DerivInternal(StencilOdd, UmuEven, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=2; + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); +} + +template +void NaiveStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ + DhopDir(in, out, dir, disp); +} +template +void NaiveStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} + +template +void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ + + Compressor compressor; + Stencil.HaloExchange(in, compressor); + auto Umu_v = Umu.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); + // thread_for( sss, in.Grid()->oSites(),{ + // Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); + // }); + assert(0); +}; + + +template +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else + DhopInternalSerialComms(st,lo,U,in,out,dag); +} +template +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + Compressor compressor; + int len = U.Grid()->oSites(); + + DhopTotalTime -= usecond(); + + DhopFaceTime -= usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + DhopFaceTime += usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Removed explicit thread comms + ////////////////////////////////////////////////////////////////////////////////////////////////////// + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=0; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + // First to enter, last to leave timing + DhopFaceTime -= usecond(); + st.CommsMerge(compressor); + DhopFaceTime -= usecond(); + + DhopComputeTime2 -= usecond(); + { + int interior=0; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime2 += usecond(); +} + +template +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + DhopTotalTime -= usecond(); + + DhopCommTime -= usecond(); + Compressor compressor; + st.HaloExchange(in, compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +}; + + //////////////////////////////////////////////////////////////// + // Reporting + //////////////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::Report(void) +{ + Coordinate latt = _grid->GlobalDimensions(); + RealD volume = 1; for(int mu=0;mu_Nprocessors; + RealD NN = _grid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" < +void NaiveStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + +//////////////////////////////////////////////////////// +// Conserved current - not yet implemented. +//////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu) +{ + assert(0); +} + +template +void NaiveStaggeredFermion::SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx) +{ + assert(0); + +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h index 1a13e73a..63fd2a2f 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h @@ -618,10 +618,10 @@ Author: paboyle NAMESPACE_BEGIN(Grid); template -void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { assert(0); @@ -680,12 +680,13 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge3 =(uint64_t)&UU[sU]( T ); + // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -702,9 +703,10 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl } #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { #ifdef AVX512 @@ -756,8 +758,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -841,9 +844,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, } #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -910,9 +913,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -181,8 +182,9 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); @@ -202,7 +205,7 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); - + } if ( dag ) { result()()(0) = - even_0 - odd_0; result()()(1) = - even_1 - odd_1; @@ -218,9 +221,10 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -253,8 +257,9 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); @@ -277,7 +283,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); - + } // Assume every site must be connected to at least one interior point. No 1^4 subvols. if ( dag ) { result()()(0) = - even_0 - odd_0; @@ -294,9 +300,10 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -329,8 +336,9 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); @@ -353,7 +362,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); - + } // Add sum of all exterior connected stencil legs if ( nmu ) { if ( dag ) { @@ -370,6 +379,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, } } +/* #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ @@ -385,7 +395,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ SiteSpinor *buf, int LLs, int sU, \ const FermionFieldView &in, FermionFieldView &out, int dag); \ - +*/ #undef LOAD_CHI NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index d301556c..d7abef27 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else { \ chi_p = &buf[SE->_offset]; \ @@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else if ( st.same_node[Dir] ) { \ chi_p = &buf[SE->_offset]; \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ @@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid); if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } template @@ -78,10 +78,12 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag) { + SiteSpinor *buf, int sF, int sU, + const FermionFieldView &in, FermionFieldView &out, int dag) +{ const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; @@ -89,8 +91,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, int ptype; int skew; - for(int s=0;s::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); @@ -109,6 +114,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -120,9 +126,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, // Only contributions from interior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; SiteSpinor chi; @@ -131,8 +138,9 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & int ptype; int skew ; - for(int s=0;s::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); @@ -152,6 +161,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -164,9 +174,10 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & // Only contributions from exterior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; // SiteSpinor chi; @@ -176,8 +187,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & int nmu=0; int skew ; - for(int s=0;s::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); @@ -197,7 +210,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); - + } if ( nmu ) { if ( dag ) { out[sF] = out[sF] - Uchi; @@ -211,72 +224,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// - template -void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=1; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=0; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionFieldView &in, FermionFieldView &out, - int dag,int interior,int exterior) -{ - switch(Opt) { -#ifdef AVX512 - case OptInlineAsm: - if ( interior && exterior ) { - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else { - std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi assert(0); } +#define KERNEL_CALLNB(A,improved) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel:: template A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); + +#define ASM_CALL(A) \ + const uint64_t NN = Nsite*Ls; \ + thread_for( ss, NN, { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +template +void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + auto UUU_v = UUU.View(AcceleratorRead); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); + auto st_v = st.View(AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} + if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} +#endif + } else if( exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} +#endif + } + assert(0 && " Kernel optimisation case not covered "); +} +template +void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + auto UUU_v= U.View(AcceleratorRead); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); + auto st_v = st.View(AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} +#endif + } else if( exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} +#endif + } +} + + +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 587bf42c..8f8c1063 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -496,5 +496,9 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField assert(0 && " Kernel optimisation case not covered "); } +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc new file mode 100644 index 00000000..c424cb2d --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc @@ -0,0 +1,36 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +const std::vector NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); +const std::vector NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master new file mode 100644 index 00000000..75b75678 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master @@ -0,0 +1,37 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class NaiveStaggeredFermion; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index 330dcfa8..72a9eaf9 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -88,6 +88,7 @@ done CC_LIST=" \ ImprovedStaggeredFermion5DInstantiation \ ImprovedStaggeredFermionInstantiation \ + NaiveStaggeredFermionInstantiation \ StaggeredKernelsInstantiation " for impl in $STAG_IMPL_LIST diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc index 93086927..e2fcd8f2 100644 --- a/benchmarks/Benchmark_staggered.cc +++ b/benchmarks/Benchmark_staggered.cc @@ -87,26 +87,6 @@ int main (int argc, char ** argv) for(int mu=0;mu(Umu,mu); } - ref = Zero(); - /* - { // Naive wilson implementation - ref = Zero(); - for(int mu=0;mu +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + + typedef typename NaiveStaggeredFermionR::FermionField FermionField; + typedef typename NaiveStaggeredFermionR::ComplexField ComplexField; + typename NaiveStaggeredFermionR::ImplParams params; + + FermionField src (&Grid); random(pRNG,src); + FermionField result(&Grid); result=Zero(); + FermionField ref(&Grid); ref=Zero(); + FermionField tmp(&Grid); tmp=Zero(); + FermionField err(&Grid); tmp=Zero(); + FermionField phi (&Grid); random(pRNG,phi); + FermionField chi (&Grid); random(pRNG,chi); + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + std::vector U(4,&Grid); + + + double volume=1; + for(int mu=0;mu(Umu,mu); + /* Debug force unit + U[mu] = 1.0; + PokeIndex(Umu,U[mu],mu); + */ + } + + ref = Zero(); + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD u0=1.0; + + { // Simple improved staggered implementation + ref = Zero(); + RealD c1tad = 0.5*c1/u0; + + Lattice > coor(&Grid); + + Lattice > x(&Grid); LatticeCoordinate(x,0); + Lattice > y(&Grid); LatticeCoordinate(y,1); + Lattice > z(&Grid); LatticeCoordinate(z,2); + Lattice > t(&Grid); LatticeCoordinate(t,3); + + Lattice > lin_z(&Grid); lin_z=x+y; + Lattice > lin_t(&Grid); lin_t=x+y+z; + + for(int mu=0;mu * = < chi | Deo^dag| phi> "< HermOpEO(Ds); + HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); + HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + + HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); + HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + + pDce = innerProduct(phi_e,dchi_e); + pDco = innerProduct(phi_o,dchi_o); + cDpe = innerProduct(chi_e,dphi_e); + cDpo = innerProduct(chi_o,dphi_o); + + std::cout<