From abc658dca5bbee1cc132f4b537561f3925eea9d0 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 20 Jun 2023 16:14:25 -0400 Subject: [PATCH 1/9] Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation --- Grid/qcd/utils/CovariantCshift.h | 9 +- Grid/tensors/Tensor_SIMT.h | 10 + tests/debug/Test_padded_cell_staple.cc | 531 +++++++++++++++++++++++++ 3 files changed, 546 insertions(+), 4 deletions(-) create mode 100644 tests/debug/Test_padded_cell_staple.cc diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 23984145..583c0f2b 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid); // Make these members of an Impl class for BC's. namespace PeriodicBC { - + //Out(x) = Link(x)*field(x+mu) template Lattice CovShiftForward(const Lattice &Link, int mu, const Lattice &field) { return Link*Cshift(field,mu,1);// moves towards negative mu } + //Out(x) = Link^dag(x-mu)*field(x-mu) template Lattice CovShiftBackward(const Lattice &Link, int mu, const Lattice &field) @@ -52,19 +53,19 @@ namespace PeriodicBC { tmp = adj(Link)*field; return Cshift(tmp,mu,-1);// moves towards positive mu } - + //Out(x) = Link^dag(x-mu) template Lattice CovShiftIdentityBackward(const Lattice &Link, int mu) { return Cshift(adj(Link), mu, -1); } - + //Out(x) = Link(x) template Lattice CovShiftIdentityForward(const Lattice &Link, int mu) { return Link; } - + //Link(x) = Link(x+mu) template Lattice ShiftStaple(const Lattice &Link, int mu) { diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 8015d74c..715735c9 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int return vec; } } +//'perm_mask' acts as a bitmask +template accelerator_inline +vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0) +{ + auto obj = vec, tmp = vec; + for (int d=0;d accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0) { diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc new file mode 100644 index 00000000..5a7f412a --- /dev/null +++ b/tests/debug/Test_padded_cell_staple.cc @@ -0,0 +1,531 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_padded_cell_staple.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +template class WilsonLoopsTest : public Gimpl { +public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + + //Original implementation + static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu, + int nu) { + + GridBase *grid = Umu.Grid(); + + std::vector U(Nd, grid); + for (int d = 0; d < Nd; d++) { + U[d] = PeekIndex(Umu, d); + } + staple = Zero(); + + if (nu != mu) { + + // mu + // ^ + // |__> nu + + // __ + // | + // __| + // + + //Forward: Out(x) = Link(x)*field(x+mu) + //Backward: Out(x) = Link^dag(x-mu)*field(x-mu) + //ShiftStaple: Link(x) = Link(x+mu) + + //tmp1 = U^dag_nu(x-nu) + //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu) + //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu) + //tmp4 = tmp(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x) + + staple += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), + mu); + + // __ + // | + // |__ + // + // + + //tmp1 = U_mu^dag(x-mu) U_nu(x-mu) + //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu) + //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu) + staple += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward(U[nu], nu, + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), + mu); + } + } + + static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu, + int nu) { + if(nu==mu){ + staple = Zero(); + return; + } + PaddedCell Ghost(1,U.Grid()); + + GaugeMat U_mu = PeekIndex(U, mu); + GaugeMat U_nu = PeekIndex(U, nu); + + GaugeMat Ug_mu = Ghost.Exchange(U_mu); + GaugeMat Ug_nu = Ghost.Exchange(U_nu); + GridBase *ggrid = Ug_mu.Grid(); + + GaugeMat gStaple(ggrid); + + Coordinate shift_0(Nd,0); + Coordinate shift_mu(Nd,0); shift_mu[mu]=1; + Coordinate shift_nu(Nd,0); shift_nu[nu]=1; + Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1; + Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1; + + std::vector shifts; + + //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x) + shifts.push_back(shift_0); + shifts.push_back(shift_nu); + shifts.push_back(shift_mu); + + //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu) + shifts.push_back(shift_mnu); + shifts.push_back(shift_mnu); + shifts.push_back(shift_mnu_pmu); + + GeneralLocalStencil gStencil(ggrid,shifts); + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + autoView( Ug_mu_v , Ug_mu, AcceleratorRead); + autoView( Ug_nu_v , Ug_nu, AcceleratorRead); + + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + + auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + + e = gStencil_v.GetEntry(3,ss); + auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(4,ss); + auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(5,ss); + auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + + staple = Ghost.Extract(gStaple); + } + + static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu, + int mu) { + GridBase *grid = Umu.Grid(); + + std::vector U(Nd, grid); + for (int d = 0; d < Nd; d++) { + U[d] = PeekIndex(Umu, d); + } + + Stap = Zero(); + + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + // __ ___ + // | __ | + // + //tmp1 = U_nu^dag(x-nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu) + //tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu) + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); + + // __ + // |__ __ | + + //tmp1 = U^dag_mu(x-mu)U_nu(x-mu) + //tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu) + //tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu) + //tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu) + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))), + mu); + + // __ + // |__ __ | + //Forward: Out(x) = Link(x)*field(x+mu) + //Backward: Out(x) = Link^dag(x-mu)*field(x-mu) + //ShiftStaple: Link(x) = Link(x+mu) + + //tmp1 = U_nu(x)U_mu(x+nu) + //tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu) + //tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu) + //tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu) + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))), + mu); + + // __ ___ + // |__ | + //tmp1 = U_nu^dag(x-nu)U_mu(x-nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu) + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))), + mu); + + // -- + // | | + // + // | | + //tmp1 = U_nu^dag(x-nu) + //tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu) + //tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu) + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); + + // | | + // + // | | + // -- + //tmp1 = U_nu(x)U_nu(x+nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu) + //tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu) + //tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu) + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))), + mu); + } + } + } + + + static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, + int mu) { + PaddedCell Ghost(2,U.Grid()); + GaugeLorentz Ug = Ghost.Exchange(const_cast(U)); + GridBase *ggrid = Ug.Grid(); + + std::vector Ug_dirs(Nd,ggrid); + for(int i=0;i(Ug, i); + + GaugeMat gStaple(ggrid); + + std::vector shifts; + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + auto genShift = [&](int mushift,int nushift){ + Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; + }; + + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+2,0)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + shifts.push_back(genShift(+2,-1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,+1)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(0,+2)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(+1,-2)); + shifts.push_back(genShift(+1,-1)); + } + } + size_t nshift = shifts.size(); + + GeneralLocalStencil gStencil(ggrid,shifts); + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + + typedef LatticeView GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=0; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + } + } + assert(s==nshift); + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + + Stap = Ghost.Extract(gStaple); + for(int i=0;i({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + typedef PeriodicGimplD Gimpl; + typedef typename WilsonLoopsTest::GaugeMat GaugeMat; + typedef typename WilsonLoopsTest::GaugeLorentz GaugeLorentz; + +#if 0 + std::cout << "Checking Staple" << std::endl; + for(int mu=0;mu::StapleOrig(staple_orig,U,mu,nu); + WilsonLoopsTest::StaplePadded(staple_padded,U,mu,nu); + GaugeMat diff = staple_orig - staple_padded; + double n = norm2(diff); + std::cout << mu << " " << nu << " " << n << std::endl; + assert(n<1e-10); + } + } + } +#endif + + std::cout << "Checking RectStaple" << std::endl; + for(int mu=0;mu::RectStapleOrig(staple_orig,U,mu); + WilsonLoopsTest::RectStaplePadded(staple_padded,U,mu); + GaugeMat diff = staple_orig - staple_padded; + double n = norm2(diff); + std::cout << mu << " " << n << std::endl; + assert(n<1e-10); + } + + Grid_finalize(); +} From 7b110751021514e4ec2ef1aa79a36ed970475864 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 20 Jun 2023 17:09:56 -0400 Subject: [PATCH 2/9] The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs) Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs --- Grid/lattice/PaddedCell.h | 28 +++++++++++++++----- Grid/qcd/action/gauge/GaugeImplementations.h | 2 +- tests/debug/Test_padded_cell_staple.cc | 23 +++++++++------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 863d361a..2efd2288 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -28,12 +28,28 @@ Author: Peter Boyle pboyle@bnl.gov NAMESPACE_BEGIN(Grid); +//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions +template +struct CshiftImplBase{ + virtual Lattice Cshift(const Lattice &in, int dir, int shift) const = 0; + virtual ~CshiftImplBase(){} +}; +template +struct CshiftImplDefault: public CshiftImplBase{ + Lattice Cshift(const Lattice &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); } +}; +template +struct CshiftImplGauge: public CshiftImplBase{ + typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } +}; + class PaddedCell { public: GridCartesian * unpadded_grid; int dims; int depth; std::vector grids; + ~PaddedCell() { DeleteGrids(); @@ -77,7 +93,7 @@ public: } }; template - inline Lattice Extract(Lattice &in) + inline Lattice Extract(const Lattice &in) { Lattice out(unpadded_grid); @@ -88,19 +104,19 @@ public: return out; } template - inline Lattice Exchange(Lattice &in) + inline Lattice Exchange(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); Lattice tmp = in; for(int d=0;d - inline Lattice Expand(int dim,Lattice &in) + inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) { GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids @@ -117,12 +133,12 @@ public: InsertSliceLocal(in,padded,x,depth+x,dim); } // High bit - shifted = Cshift(in,dim,depth); + shifted = cshift.Cshift(in,dim,depth); for(int x=0;x &conjDirs) { _conjDirs=conjDirs; } + static inline void setDirections(const std::vector &conjDirs) { _conjDirs=conjDirs; } static inline std::vector getDirections(void) { return _conjDirs; } static inline bool isPeriodicGaugeField(void) { return false; } }; diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc index 5a7f412a..bed00cad 100644 --- a/tests/debug/Test_padded_cell_staple.cc +++ b/tests/debug/Test_padded_cell_staple.cc @@ -106,9 +106,10 @@ public: GaugeMat U_mu = PeekIndex(U, mu); GaugeMat U_nu = PeekIndex(U, nu); - - GaugeMat Ug_mu = Ghost.Exchange(U_mu); - GaugeMat Ug_nu = Ghost.Exchange(U_nu); + + CshiftImplGauge cshift_impl; + GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl); + GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl); GridBase *ggrid = Ug_mu.Grid(); GaugeMat gStaple(ggrid); @@ -299,11 +300,11 @@ public: static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, int mu) { PaddedCell Ghost(2,U.Grid()); - GaugeLorentz Ug = Ghost.Exchange(const_cast(U)); - GridBase *ggrid = Ug.Grid(); - + GridBase *ggrid = Ghost.grids.back(); + + CshiftImplGauge cshift_impl; std::vector Ug_dirs(Nd,ggrid); - for(int i=0;i(Ug, i); + for(int i=0;i(U, i), cshift_impl); GaugeMat gStaple(ggrid); @@ -495,11 +496,14 @@ int main (int argc, char ** argv) SU::HotConfiguration(pRNG,U); - typedef PeriodicGimplD Gimpl; + //typedef PeriodicGimplD Gimpl; + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); + typedef typename WilsonLoopsTest::GaugeMat GaugeMat; typedef typename WilsonLoopsTest::GaugeLorentz GaugeLorentz; -#if 0 std::cout << "Checking Staple" << std::endl; for(int mu=0;mu Date: Wed, 21 Jun 2023 16:01:01 -0400 Subject: [PATCH 3/9] Imported coalescedReadGeneralPermute GPU implementation from Christoph Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed Fixed compile issue with pointer cast in padded staple code Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple --- Grid/tensors/Tensor_SIMT.h | 10 +- tests/debug/Test_padded_cell_staple.cc | 306 ++++++++++++++----------- 2 files changed, 185 insertions(+), 131 deletions(-) diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 715735c9..9be2c500 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -93,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ { vstream(vec, extracted); } -#else +#else //==GRID_SIMT //#ifndef GRID_SYCL @@ -176,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec, return extractLane(plane,vec); } template accelerator_inline +typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd())) +{ + int plane = lane; + for (int d=0;d> (d + 1)) : plane; + return extractLane(plane,vec); +} +template accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd())) { insertLane(lane,vec,extracted); diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc index bed00cad..6d7bc8c9 100644 --- a/tests/debug/Test_padded_cell_staple.cc +++ b/tests/debug/Test_padded_cell_staple.cc @@ -102,18 +102,31 @@ public: staple = Zero(); return; } - PaddedCell Ghost(1,U.Grid()); + double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0; + + double tstart = usecond(); + double t=tstart; + + PaddedCell Ghost(1, (GridCartesian*)U.Grid()); + construct += usecond() - t; + + t=usecond(); GaugeMat U_mu = PeekIndex(U, mu); GaugeMat U_nu = PeekIndex(U, nu); + peek += usecond() - t; + t=usecond(); CshiftImplGauge cshift_impl; GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl); GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl); + exchange += usecond() - t; + GridBase *ggrid = Ug_mu.Grid(); GaugeMat gStaple(ggrid); + t=usecond(); Coordinate shift_0(Nd,0); Coordinate shift_mu(Nd,0); shift_mu[mu]=1; Coordinate shift_nu(Nd,0); shift_nu[nu]=1; @@ -131,37 +144,50 @@ public: shifts.push_back(shift_mnu); shifts.push_back(shift_mnu); shifts.push_back(shift_mnu_pmu); + coord += usecond()-t; + t=usecond(); GeneralLocalStencil gStencil(ggrid,shifts); - autoView( gStaple_v , gStaple, AcceleratorWrite); - auto gStencil_v = gStencil.View(); - autoView( Ug_mu_v , Ug_mu, AcceleratorRead); - autoView( Ug_nu_v , Ug_nu, AcceleratorRead); - - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { - GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); - auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(1,ss); - auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(2,ss); - auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); - - auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + stencil += usecond() -t; - e = gStencil_v.GetEntry(3,ss); - auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(4,ss); - auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(5,ss); - auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); - - stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; - - coalescedWrite(gStaple_v[ss],stencil_ss); - } - ); + t=usecond(); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + autoView( Ug_mu_v , Ug_mu, AcceleratorRead); + autoView( Ug_nu_v , Ug_nu, AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + + auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + + e = gStencil_v.GetEntry(3,ss); + auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(4,ss); + auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(5,ss); + auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + kernel += usecond() - t; + + t=usecond(); staple = Ghost.Extract(gStaple); + extract += usecond()-t; + + total += usecond() - tstart; + std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl; } static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu, @@ -299,7 +325,7 @@ public: static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, int mu) { - PaddedCell Ghost(2,U.Grid()); + PaddedCell Ghost(2,(GridCartesian*)U.Grid()); GridBase *ggrid = Ghost.grids.back(); CshiftImplGauge cshift_impl; @@ -361,117 +387,119 @@ public: size_t nshift = shifts.size(); GeneralLocalStencil gStencil(ggrid,shifts); - autoView( gStaple_v , gStaple, AcceleratorWrite); - auto gStencil_v = gStencil.View(); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); - typedef LatticeView GaugeViewType; - size_t vsize = Nd*sizeof(GaugeViewType); - GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); - for(int i=0;i GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { - decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; - stencil_ss = Zero(); - int s=0; - for(int nu=0;nu_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=0; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) - e = gStencil_v.GetEntry(s++,ss); - U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + } } + assert(s==nshift); + coalescedWrite(gStaple_v[ss],stencil_ss); } - assert(s==nshift); - coalescedWrite(gStaple_v[ss],stencil_ss); - } - ); + ); - Stap = Ghost.Extract(gStaple); - for(int i=0;i::GaugeMat GaugeMat; typedef typename WilsonLoopsTest::GaugeLorentz GaugeLorentz; - std::cout << "Checking Staple" << std::endl; + std::cout << GridLogMessage << "Checking Staple" << std::endl; + int count = 0; + double torig=0, tpadded=0; + for(int mu=0;mu::StapleOrig(staple_orig,U,mu,nu); + double t0 = usecond(); + WilsonLoopsTest::StapleOrig(staple_orig,U,mu,nu); + double t1 = usecond(); WilsonLoopsTest::StaplePadded(staple_padded,U,mu,nu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + GaugeMat diff = staple_orig - staple_padded; double n = norm2(diff); - std::cout << mu << " " << nu << " " << n << std::endl; + std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl; assert(n<1e-10); } } } - - std::cout << "Checking RectStaple" << std::endl; + std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + count=0; torig=tpadded=0; + + std::cout << GridLogMessage << "Checking RectStaple" << std::endl; for(int mu=0;mu::RectStapleOrig(staple_orig,U,mu); + double t0 = usecond(); + WilsonLoopsTest::RectStapleOrig(staple_orig,U,mu); + double t1 = usecond(); WilsonLoopsTest::RectStaplePadded(staple_padded,U,mu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + GaugeMat diff = staple_orig - staple_padded; double n = norm2(diff); - std::cout << mu << " " << n << std::endl; + std::cout << GridLogMessage << mu << " " << n << std::endl; assert(n<1e-10); } - + std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + Grid_finalize(); } From 36cc9c524fd75dc904cd02fb0e537ff8cd367c06 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Fri, 23 Jun 2023 09:57:38 -0400 Subject: [PATCH 4/9] Threaded the constructor of GeneralLocalStencil --- Grid/stencil/GeneralLocalStencil.h | 104 ++++++++++++++--------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index 5e9f2cd6..333f7028 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -79,60 +79,60 @@ public: this->_entries.resize(npoints* osites); this->_entries_p = &_entries[0]; + thread_for(site, osites, { + Coordinate Coor; + Coordinate NbrCoor; - Coordinate Coor; - Coordinate NbrCoor; - for(Integer site=0;siteoCoorFromOindex(Coor,site); - for(int d=0;d_rdimensions[d]; - NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd; + for(Integer ii=0;iioCoorFromOindex(Coor,site); + for(int d=0;d_rdimensions[d]; + NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd; + } + SE._offset = grid->oIndexReduced(NbrCoor); + + //////////////////////////////////////////////// + // Inner index permute calculation + // Simpler version using icoor calculation + //////////////////////////////////////////////// + SE._permute =0; + for(int d=0;d_fdimensions[d]; + int rd = grid->_rdimensions[d]; + int ly = grid->_simd_layout[d]; + + assert((ly==1)||(ly==2)); + + int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1 + int x = Coor[d]; // x in [0... rd-1] as an oSite + + int permute_dim = grid->PermuteDim(d); + int permute_slice=0; + if(permute_dim){ + int num = shift%rd; // Slice within dest osite cell of slice zero + int wrap = shift/rd; // Number of osite local volume cells crossed through + // x+num < rd dictates whether we are in same permute state as slice 0 + if ( x< rd-num ) permute_slice=wrap; + else permute_slice=(wrap+1)%ly; + } + if ( permute_slice ) { + int ptype =grid->PermuteType(d); + uint8_t mask =0x1<_entries[lex] = SE; } - SE._offset = grid->oIndexReduced(NbrCoor); - - //////////////////////////////////////////////// - // Inner index permute calculation - // Simpler version using icoor calculation - //////////////////////////////////////////////// - SE._permute =0; - for(int d=0;d_fdimensions[d]; - int rd = grid->_rdimensions[d]; - int ly = grid->_simd_layout[d]; - - assert((ly==1)||(ly==2)); - - int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1 - int x = Coor[d]; // x in [0... rd-1] as an oSite - - int permute_dim = grid->PermuteDim(d); - int permute_slice=0; - if(permute_dim){ - int num = shift%rd; // Slice within dest osite cell of slice zero - int wrap = shift/rd; // Number of osite local volume cells crossed through - // x+num < rd dictates whether we are in same permute state as slice 0 - if ( x< rd-num ) permute_slice=wrap; - else permute_slice=(wrap+1)%ly; - } - if ( permute_slice ) { - int ptype =grid->PermuteType(d); - uint8_t mask =0x1<_entries[lex] = SE; - } - } + }); } }; From 4c6613d72cb6f576e8dd01958af372df661fe369 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 26 Jun 2023 10:20:23 -0400 Subject: [PATCH 5/9] Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs Removed optimized staple only being used for periodic gauge BCs; it is now always used --- Grid/qcd/utils/WilsonLoops.h | 34 +++---- tests/debug/Test_optimized_staple_gaugebc.cc | 94 ++++++++++++++++++++ 2 files changed, 107 insertions(+), 21 deletions(-) create mode 100644 tests/debug/Test_optimized_staple_gaugebc.cc diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index d6efbd5d..516158d0 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -707,15 +707,11 @@ public: // the sum over all staples on each site ////////////////////////////////////////////////// static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) { - U2 = U * Cshift(U, mu, 1); + U2 = U * CshiftLink(U, mu, 1); } //////////////////////////////////////////////////////////////////////////// - // Hop by two optimisation strategy does not work nicely with Gparity. (could - // do, - // but need to track two deep where cross boundary and apply a conjugation). - // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do - // so . + // Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2' //////////////////////////////////////////////////////////////////////////// static void RectStapleOptimised(GaugeMat &Stap, std::vector &U2, std::vector &U, int mu) { @@ -732,9 +728,9 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(adj(U[nu]), nu, -1); + tmp = CshiftLink(adj(U[nu]), nu, -1); tmp = adj(U2[mu]) * tmp; - tmp = Cshift(tmp, mu, -2); + tmp = CshiftLink(tmp, mu, -2); Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp); @@ -742,14 +738,14 @@ public: // |___ ___| // tmp = adj(U2[mu]) * U[nu]; - Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2)); + Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, CshiftLink(tmp, mu, -2)); // ___ ___ // | ___| // |___ ___| // - Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1); + Stap += CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1); // ___ ___ // |___ | @@ -758,7 +754,7 @@ public: // tmp= Staple2x1* Cshift(U[mu],mu,-2); // Stap+= Cshift(tmp,mu,1) ; - Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1); + Stap += CshiftLink(Staple2x1, mu, 1) * CshiftLink(U[mu], mu, -1); ; // -- @@ -766,10 +762,10 @@ public: // // | | - tmp = Cshift(adj(U2[nu]), nu, -2); + tmp = CshiftLink(adj(U2[nu]), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); - tmp = U2[nu] * Cshift(tmp, nu, 2); - Stap += Cshift(tmp, mu, 1); + tmp = U2[nu] * CshiftLink(tmp, nu, 2); + Stap += CshiftLink(tmp, mu, 1); // | | // @@ -778,8 +774,8 @@ public: tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]); tmp = adj(U2[nu]) * tmp; - tmp = Cshift(tmp, nu, -2); - Stap += Cshift(tmp, mu, 1); + tmp = CshiftLink(tmp, nu, -2); + Stap += CshiftLink(tmp, mu, 1); } } } @@ -790,11 +786,7 @@ public: static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, std::vector &U2, std::vector &U, int mu) { - if (Gimpl::isPeriodicGaugeField()) { - RectStapleOptimised(Stap, U2, U, mu); - } else { - RectStapleUnoptimised(Stap, Umu, mu); - } + RectStapleOptimised(Stap, U2, U, mu); } static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu, diff --git a/tests/debug/Test_optimized_staple_gaugebc.cc b/tests/debug/Test_optimized_staple_gaugebc.cc new file mode 100644 index 00000000..63822378 --- /dev/null +++ b/tests/debug/Test_optimized_staple_gaugebc.cc @@ -0,0 +1,94 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_padded_cell_staple.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + std::cout << " mpi "<({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + //#define PRD +#ifdef PRD + typedef PeriodicGimplD Gimpl; +#else + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); +#endif + + typedef typename WilsonLoops::GaugeMat GaugeMat; + typedef typename WilsonLoops::GaugeLorentz GaugeLorentz; + + int count = 0; + double torig=0, topt=0; + + std::vector Umu(Nd,&GRID), U2(Nd,&GRID); + for(int mu=0;mu(U,mu); + WilsonLoops::RectStapleDouble(U2[mu], Umu[mu], mu); + } + + std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl; + for(int mu=0;mu::RectStapleUnoptimised(staple_orig,U,mu); + double t1 = usecond(); + WilsonLoops::RectStapleOptimised(staple_opt, U2, Umu, mu); + double t2 = usecond(); + torig += t1-t0; topt += t2-t1; + ++count; + + GaugeMat diff = staple_orig - staple_opt; + double n = norm2(diff); + std::cout << GridLogMessage << mu << " " << n << std::endl; + assert(n<1e-10); + } + std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, optimized: " << topt/1000/count << "ms" << std::endl; + + Grid_finalize(); +} From 6f6844ccf11f2e7db4c8c75cd0edac017e4bb1e0 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 26 Jun 2023 15:48:47 -0400 Subject: [PATCH 6/9] Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array Modified plaq+rectangle gauge actions to use the above Added a test code to confirm the above changes --- .../action/gauge/PlaqPlusRectangleAction.h | 18 +- Grid/qcd/utils/WilsonLoops.h | 49 +++-- tests/debug/Test_iwasaki_action_newstaple.cc | 182 ++++++++++++++++++ tests/debug/Test_optimized_staple_gaugebc.cc | 2 +- 4 files changed, 225 insertions(+), 26 deletions(-) create mode 100644 tests/debug/Test_iwasaki_action_newstaple.cc diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 7690092d..039d39e0 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -79,27 +79,19 @@ public: GridBase *grid = Umu.Grid(); std::vector U (Nd,grid); - std::vector U2(Nd,grid); - for(int mu=0;mu(Umu,mu); - WilsonLoops::RectStapleDouble(U2[mu],U[mu],mu); } + std::vector RectStaple(Nd,grid), Staple(Nd,grid); + WilsonLoops::StapleAll(Staple, U); + WilsonLoops::RectStapleAll(RectStaple, U); GaugeLinkField dSdU_mu(grid); GaugeLinkField staple(grid); for (int mu=0; mu < Nd; mu++){ - - // Staple in direction mu - - WilsonLoops::Staple(staple,Umu,mu); - - dSdU_mu = Ta(U[mu]*staple)*factor_p; - - WilsonLoops::RectStaple(Umu,staple,U2,U,mu); - - dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; + dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p; + dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r; PokeIndex(dSdU, dSdU_mu, mu); } diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 516158d0..04359ebf 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -290,7 +290,7 @@ public: } */ ////////////////////////////////////////////////// - // the sum over all staples on each site + // the sum over all nu-oriented staples for nu != mu on each site ////////////////////////////////////////////////// static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { @@ -300,6 +300,10 @@ public: for (int d = 0; d < Nd; d++) { U[d] = PeekIndex(Umu, d); } + Staple(staple, U, mu); + } + + static void Staple(GaugeMat &staple, const std::vector &U, int mu) { staple = Zero(); for (int nu = 0; nu < Nd; nu++) { @@ -335,6 +339,16 @@ public: } } + ///////////// + //Staples for each direction mu, summed over nu != mu + //staple: output staples for each mu (Nd) + //U: link array (Nd) + ///////////// + static void StapleAll(std::vector &staple, const std::vector &U) { + assert(staple.size() == Nd); assert(U.size() == Nd); + for(int mu=0;mu &U2, - std::vector &U, int mu) { + static void RectStapleOptimised(GaugeMat &Stap, const std::vector &U2, + const std::vector &U, int mu) { Stap = Zero(); @@ -780,15 +794,6 @@ public: } } - static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { - RectStapleUnoptimised(Stap, Umu, mu); - } - static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, - std::vector &U2, std::vector &U, - int mu) { - RectStapleOptimised(Stap, U2, U, mu); - } - static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { GridBase *grid = Umu.Grid(); @@ -887,6 +892,26 @@ public: } } + static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { + RectStapleUnoptimised(Stap, Umu, mu); + } + static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, + std::vector &U2, std::vector &U, + int mu) { + RectStapleOptimised(Stap, U2, U, mu); + } + ////////////////////////////////////////////////////// + //Compute the rectangular staples for all orientations + //Stap : Array of staples (Nd) + //U: Gauge links in each direction (Nd) + ///////////////////////////////////////////////////// + static void RectStapleAll(std::vector &Stap, const std::vector &U){ + assert(Stap.size() == Nd); assert(U.size() == Nd); + std::vector U2(Nd,U[0].Grid()); + for(int mu=0;mu +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//////////////////////////////////////////////////////////////////////// +// PlaqPlusRectangleActoin +//////////////////////////////////////////////////////////////////////// +template +class PlaqPlusRectangleActionOrig : public Action { +public: + + INHERIT_GIMPL_TYPES(Gimpl); + +private: + RealD c_plaq; + RealD c_rect; + +public: + PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){}; + + virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";} + + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms + + virtual std::string LogParameters(){ + std::stringstream sstream; + sstream << GridLogMessage << "["<gSites(); + + RealD plaq = WilsonLoops::avgPlaquette(U); + RealD rect = WilsonLoops::avgRectangle(U); + + RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5 + +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol; + + return action; + }; + + virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) { + //extend Ta to include Lorentz indexes + RealD factor_p = c_plaq/RealD(Nc)*0.5; + RealD factor_r = c_rect/RealD(Nc)*0.5; + + GridBase *grid = Umu.Grid(); + + std::vector U (Nd,grid); + std::vector U2(Nd,grid); + + for(int mu=0;mu(Umu,mu); + WilsonLoops::RectStapleDouble(U2[mu],U[mu],mu); + } + + GaugeLinkField dSdU_mu(grid); + GaugeLinkField staple(grid); + + for (int mu=0; mu < Nd; mu++){ + + // Staple in direction mu + + WilsonLoops::Staple(staple,Umu,mu); + + dSdU_mu = Ta(U[mu]*staple)*factor_p; + + WilsonLoops::RectStaple(Umu,staple,U2,U,mu); + + dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; + + PokeIndex(dSdU, dSdU_mu, mu); + } + + }; + +}; + +// Convenience for common physically defined cases. +// +// RBC c1 parameterisation is not really RBC but don't have good +// reference and we are happy to change name if prior use of this plaq coeff +// parameterisation is made known to us. +template +class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig { +public: + INHERIT_GIMPL_TYPES(Gimpl); + RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig(beta*(1.0-8.0*c1), beta*c1) {}; + virtual std::string action_name(){return "RBCGaugeActionOrig";} +}; + +template +class IwasakiGaugeActionOrig : public RBCGaugeActionOrig { +public: + INHERIT_GIMPL_TYPES(Gimpl); + IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig(beta,-0.331) {}; + virtual std::string action_name(){return "IwasakiGaugeActionOrig";} +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + std::cout << " mpi "<({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + //#define PRD +#ifdef PRD + typedef PeriodicGimplD Gimpl; +#else + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); +#endif + + typedef typename WilsonLoops::GaugeMat GaugeMat; + typedef typename WilsonLoops::GaugeLorentz GaugeLorentz; + + GaugeLorentz derivOrig(&GRID), derivNew(&GRID); + double beta = 2.13; + IwasakiGaugeActionOrig action_orig(beta); + IwasakiGaugeAction action_new(beta); + + double t0 = usecond(); + action_orig.deriv(U, derivOrig); + double t1 = usecond(); + action_new.deriv(U, derivNew); + double t2 = usecond(); + + GaugeLorentz diff = derivOrig - derivNew; + double n = norm2(diff); + std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl; + assert(n<1e-10); + + std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms, new: " << (t2-t1)/1000 << "ms" << std::endl; + + Grid_finalize(); +} diff --git a/tests/debug/Test_optimized_staple_gaugebc.cc b/tests/debug/Test_optimized_staple_gaugebc.cc index 63822378..51628ab0 100644 --- a/tests/debug/Test_optimized_staple_gaugebc.cc +++ b/tests/debug/Test_optimized_staple_gaugebc.cc @@ -2,7 +2,7 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./tests/Test_padded_cell_staple.cc + Source file: ./tests/Test_optimized_staple_gaugebc.cc Copyright (C) 2015 From bb71e9a96a14430075219bd077cb90648b2258b4 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 27 Jun 2023 11:23:30 -0400 Subject: [PATCH 7/9] Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links --- Grid/lattice/Lattice.h | 1 + Grid/lattice/PaddedCell.h | 2 + .../action/gauge/PlaqPlusRectangleAction.h | 3 +- Grid/qcd/utils/WilsonLoops.h | 300 +++++++++++++++++- Grid/stencil/Stencil.h | 1 + 5 files changed, 304 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index c4adf86a..6343db99 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -47,3 +47,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 2efd2288..5cba3355 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -26,6 +26,8 @@ Author: Peter Boyle pboyle@bnl.gov /* END LEGAL */ #pragma once +#include + NAMESPACE_BEGIN(Grid); //Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 039d39e0..68eb0a67 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -83,8 +83,7 @@ public: U[mu] = PeekIndex(Umu,mu); } std::vector RectStaple(Nd,grid), Staple(Nd,grid); - WilsonLoops::StapleAll(Staple, U); - WilsonLoops::RectStapleAll(RectStaple, U); + WilsonLoops::StapleAndRectStapleAll(Staple, RectStaple, U); GaugeLinkField dSdU_mu(grid); GaugeLinkField staple(grid); diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 04359ebf..c3abb0ec 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -348,7 +348,91 @@ public: assert(staple.size() == Nd); assert(U.size() == Nd); for(int mu=0;mu &staple, const std::vector &U_padded, const PaddedCell &Cell) { + assert(U_padded.size() == Nd); assert(staple.size() == Nd); + assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); + assert(Cell.depth >= 1); + GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + + //Generate shift arrays + std::vector shifts; + for(int mu=0;muoSites(), ggrid->Nsimd(), { + auto stencil_ss = coalescedRead(rgStaple_v[ss]); + + GeneralStencilEntry const* e = gStencil_v.GetEntry(off,ss); + auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off+1,ss); + auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off+2,ss); + auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + + e = gStencil_v.GetEntry(off+3,ss); + auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(off+4,ss); + auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off+5,ss); + auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + off += 6; + }//nu!=mu + }//nu loop + staple[mu] = Cell.Extract(gStaple); + }//mu loop + + } + + ////////////////////////////////////////////////// // the sum over all staples on each site in direction mu,nu, upper part ////////////////////////////////////////////////// @@ -912,6 +996,220 @@ public: for(int mu=0;mu &staple, const std::vector &U_padded, const PaddedCell &Cell) { + assert(U_padded.size() == Nd); assert(staple.size() == Nd); + assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); + assert(Cell.depth >= 2); + GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + + std::vector shifts; + for (int mu = 0; mu < Nd; mu++){ + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + auto genShift = [&](int mushift,int nushift){ + Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; + }; + + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+2,0)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + shifts.push_back(genShift(+2,-1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,+1)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(0,+2)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(+1,-2)); + shifts.push_back(genShift(+1,-1)); + } + } + } + size_t nshift = shifts.size(); + int mu_off_delta = nshift / Nd; + GeneralLocalStencil gStencil(ggrid,shifts); + + //Open views to padded gauge links and keep open over mu loop + typedef LatticeView GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=offset; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + } + } + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + offset += mu_off_delta; + }//kernel/view scope + + staple[mu] = Cell.Extract(gStaple); + }//mu loop + + for(int i=0;i &Stap, std::vector &RectStap, const std::vector &U){ +#if 0 + StapleAll(Stap, U); + RectStapleAll(RectStap, U); +#else + //Use the padded cell with maximal reuse + PaddedCell Ghost(2, dynamic_cast(U[0].Grid())); + CshiftImplGauge cshift_impl; + std::vector U_pad(Nd, Ghost.grids.back()); + for(int mu=0;mu // subdir aggregate #include // subdir aggregate +#include ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient From f44dce390f3400ea5e40c16b46ff4c7ebd1357fc Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 27 Jun 2023 14:58:10 -0400 Subject: [PATCH 8/9] Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding Fixed const correctness on PaddedCell methods Fixed compile issues on Crusher Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance Optimized kernel for StaplePadded Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings --- Grid/lattice/Lattice_transfer.h | 126 +++++++++++++++++- Grid/lattice/PaddedCell.h | 26 +++- Grid/qcd/utils/WilsonLoops.h | 129 ++++++++++++------- tests/debug/Test_iwasaki_action_newstaple.cc | 26 ++-- 4 files changed, 245 insertions(+), 62 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 04540d88..668ef4b4 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -697,8 +697,68 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int d=0;d_processors[d] == Tg->_processors[d]); } - // the above should guarantee that the operations are local + +#if 1 + + size_t nsite = 1; + for(int i=0;ioIndex(from_coor); + int fiidx = Fg->iIndex(from_coor); + int toidx = Tg->oIndex(to_coor); + int tiidx = Tg->iIndex(to_coor); + int* tt = table + 4*idx; + tt[0] = foidx; + tt[1] = fiidx; + tt[2] = toidx; + tt[3] = tiidx; + }); + + int* table_d = (int*)acceleratorAllocDevice(tbytes); + acceleratorCopyToDevice(table,table_d,tbytes); + + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + autoView(from_v,From,AcceleratorRead); + autoView(to_v,To,AcceleratorWrite); + + accelerator_for(idx,nsite,1,{ + static const int words=sizeof(vobj)/sizeof(vector_type); + int* tt = table_d + 4*idx; + int from_oidx = *tt++; + int from_lane = *tt++; + int to_oidx = *tt++; + int to_lane = *tt; + + const vector_type* from = (const vector_type *)&from_v[from_oidx]; + vector_type* to = (vector_type *)&to_v[to_oidx]; + + scalar_type stmp; + for(int w=0;w_ldimensions; Coordinate rdf = Fg->_rdimensions; Coordinate isf = Fg->_istride; @@ -738,6 +798,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro #endif } }); + +#endif } @@ -830,6 +892,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } +//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim +//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same template void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { @@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int } } +#if 1 + size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog]; + size_t tbytes = 4*nsite*sizeof(int); + int *table = (int*)malloc(tbytes); + + thread_for(idx,nsite,{ + Coordinate lcoor(nl); + Coordinate hcoor(nh); + lcoor[orthog] = slice_lo; + hcoor[orthog] = slice_hi; + size_t rem = idx; + for(int mu=0;muLocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu]; + lcoor[mu] = hcoor[mu] = xmu; + } + } + int loidx = lg->oIndex(lcoor); + int liidx = lg->iIndex(lcoor); + int hoidx = hg->oIndex(hcoor); + int hiidx = hg->iIndex(hcoor); + int* tt = table + 4*idx; + tt[0] = loidx; + tt[1] = liidx; + tt[2] = hoidx; + tt[3] = hiidx; + }); + + int* table_d = (int*)acceleratorAllocDevice(tbytes); + acceleratorCopyToDevice(table,table_d,tbytes); + + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + autoView(lowDim_v,lowDim,AcceleratorRead); + autoView(higherDim_v,higherDim,AcceleratorWrite); + + accelerator_for(idx,nsite,1,{ + static const int words=sizeof(vobj)/sizeof(vector_type); + int* tt = table_d + 4*idx; + int from_oidx = *tt++; + int from_lane = *tt++; + int to_oidx = *tt++; + int to_lane = *tt; + + const vector_type* from = (const vector_type *)&lowDim_v[from_oidx]; + vector_type* to = (vector_type *)&higherDim_v[to_oidx]; + + scalar_type stmp; + for(int w=0;w &lowDim, Lattice & higherDim,int pokeLocalSite(s,higherDimv,hcoor); } }); +#endif } diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 5cba3355..5dfb6b0f 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -95,7 +95,7 @@ public: } }; template - inline Lattice Extract(const Lattice &in) + inline Lattice Extract(const Lattice &in) const { Lattice out(unpadded_grid); @@ -106,7 +106,7 @@ public: return out; } template - inline Lattice Exchange(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) + inline Lattice Exchange(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); @@ -118,7 +118,7 @@ public: } // expand up one dim at a time template - inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) + inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids @@ -130,20 +130,40 @@ public: else conformable(old_grid,grids[dim-1]); std::cout << " dim "<= 1); GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + double t0 = usecond(); //Generate shift arrays std::vector shifts; for(int mu=0;mu GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { - auto stencil_ss = coalescedRead(rgStaple_v[ss]); - - GeneralStencilEntry const* e = gStencil_v.GetEntry(off,ss); - auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(off+1,ss); - auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(off+2,ss); - auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + { //view scope + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int off = outer_off; + + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + stencil_ss = stencil_ss + U2 * U1 * U0; - e = gStencil_v.GetEntry(off+3,ss); - auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(off+4,ss); - auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(off+5,ss); - auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(off++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; - - coalescedWrite(gStaple_v[ss],stencil_ss); + stencil_ss = stencil_ss + U2 * U1 * U0; } - ); - } //ensure views are all closed! - off += 6; - }//nu!=mu - }//nu loop + } + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + staple[mu] = Cell.Extract(gStaple); + outer_off += shift_mu_off; }//mu loop + for(int i=0;i= 2); GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + double t0 = usecond(); std::vector shifts; for (int mu = 0; mu < Nd; mu++){ for (int nu = 0; nu < Nd; nu++) { @@ -1060,8 +1081,11 @@ public: } size_t nshift = shifts.size(); int mu_off_delta = nshift / Nd; + double t1 = usecond(); + GeneralLocalStencil gStencil(ggrid,shifts); - + double t2 = usecond(); + //Open views to padded gauge links and keep open over mu loop typedef LatticeView GaugeViewType; size_t vsize = Nd*sizeof(GaugeViewType); @@ -1183,6 +1207,10 @@ public: for(int i=0;i(U[0].Grid())); CshiftImplGauge cshift_impl; std::vector U_pad(Nd, Ghost.grids.back()); for(int mu=0;mu action_orig(beta); IwasakiGaugeAction action_new(beta); - double t0 = usecond(); - action_orig.deriv(U, derivOrig); - double t1 = usecond(); - action_new.deriv(U, derivNew); - double t2 = usecond(); + double torig=0, tnew=0; + int ntest = 10; + for(int i=0;i Date: Wed, 28 Jun 2023 15:11:24 -0400 Subject: [PATCH 9/9] The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher --- .../action/gauge/PlaqPlusRectangleAction.h | 4 +- Grid/qcd/utils/WilsonLoops.h | 319 ++++++++++++------ 2 files changed, 220 insertions(+), 103 deletions(-) diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 68eb0a67..b9d6ac16 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -43,7 +43,7 @@ public: private: RealD c_plaq; RealD c_rect; - + typename WilsonLoops::StapleAndRectStapleAllWorkspace workspace; public: PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){}; @@ -83,7 +83,7 @@ public: U[mu] = PeekIndex(Umu,mu); } std::vector RectStaple(Nd,grid), Staple(Nd,grid); - WilsonLoops::StapleAndRectStapleAll(Staple, RectStaple, U); + WilsonLoops::StapleAndRectStapleAll(Staple, RectStaple, U, workspace); GaugeLinkField dSdU_mu(grid); GaugeLinkField staple(grid); diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 731b755c..78e25a8d 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -349,47 +349,129 @@ public: for(int mu=0;mu stencil; + size_t nshift; + + void generateStencil(GridBase* padded_grid){ + double t0 = usecond(); + + //Generate shift arrays + std::vector shifts = this->getShifts(); + nshift = shifts.size(); + + double t1 = usecond(); + //Generate local stencil + stencil.reset(new GeneralLocalStencil(padded_grid,shifts)); + double t2 = usecond(); + std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl; + } + public: + //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly + const GeneralLocalStencil & getStencil(const PaddedCell &pcell){ + assert(pcell.depth >= this->paddingDepth()); + if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back()); + return *stencil; + } + size_t Nshift() const{ return nshift; } + + virtual std::vector getShifts() const = 0; + virtual int paddingDepth() const = 0; //padding depth required + + virtual ~WilsonLoopPaddedStencilWorkspace(){} + }; + + //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces + class WilsonLoopPaddedWorkspace{ + std::vector stencil_wk; + std::unique_ptr pcell; + + void generatePcell(GridBase* unpadded_grid){ + assert(stencil_wk.size()); + int max_depth = 0; + for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth()); + + pcell.reset(new PaddedCell(max_depth, dynamic_cast(unpadded_grid))); + } + + public: + //Add a stencil definition. This should be done before the first call to retrieve a stencil object. + //Takes ownership of the pointer + void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){ + assert(!pcell); + stencil_wk.push_back(stencil); + } + + const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){ + if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid); + return stencil_wk[stencil_idx]->getStencil(*pcell); + } + const PaddedCell & getPaddedCell(GridBase* unpadded_grid){ + if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid); + return *pcell; + } + + ~WilsonLoopPaddedWorkspace(){ + for(auto &s : stencil_wk) delete s; + } + }; + + //A workspace class allowing reuse of the stencil + class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{ + public: + std::vector getShifts() const override{ + std::vector shifts; + for(int mu=0;mu &staple, const std::vector &U_padded, const PaddedCell &Cell) { + StaplePaddedAllWorkspace wk; + StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); + } + + //Padded cell implementation of the staple method for all mu, summed over nu != mu + //staple: output staple for each mu, summed over nu != mu (Nd) + //U_padded: the gauge link fields padded out using the PaddedCell class + //Cell: the padded cell class + //gStencil: the precomputed generalized local stencil for the staple + static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + double t0 = usecond(); assert(U_padded.size() == Nd); assert(staple.size() == Nd); assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); assert(Cell.depth >= 1); GridBase *ggrid = U_padded[0].Grid(); //padded cell grid - double t0 = usecond(); - //Generate shift arrays - std::vector shifts; - for(int mu=0;mu GaugeViewType; @@ -447,9 +529,9 @@ public: free(Ug_dirs_v_host); acceleratorFreeDevice(Ug_dirs_v); - double t3=usecond(); + double t1=usecond(); - std::cout << GridLogPerformance << "StaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl; + std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl; } @@ -1016,75 +1098,91 @@ public: for(int mu=0;mu getShifts() const override{ + std::vector shifts; + for (int mu = 0; mu < Nd; mu++){ + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + auto genShift = [&](int mushift,int nushift){ + Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; + }; + + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+2,0)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + shifts.push_back(genShift(+2,-1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,+1)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(0,+2)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(+1,-2)); + shifts.push_back(genShift(+1,-1)); + } + } + } + return shifts; + } + + int paddingDepth() const override{ return 2; } + }; + //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu //staple: output staple for each mu, summed over nu != mu (Nd) //U_padded: the gauge link fields padded out using the PaddedCell class //Cell: the padded cell class static void RectStaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell) { + RectStaplePaddedAllWorkspace wk; + RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); + } + + //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu + //staple: output staple for each mu, summed over nu != mu (Nd) + //U_padded: the gauge link fields padded out using the PaddedCell class + //Cell: the padded cell class + //gStencil: the stencil + static void RectStaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + double t0 = usecond(); assert(U_padded.size() == Nd); assert(staple.size() == Nd); assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); assert(Cell.depth >= 2); GridBase *ggrid = U_padded[0].Grid(); //padded cell grid - double t0 = usecond(); - std::vector shifts; - for (int mu = 0; mu < Nd; mu++){ - for (int nu = 0; nu < Nd; nu++) { - if (nu != mu) { - auto genShift = [&](int mushift,int nushift){ - Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; - }; - - //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) - shifts.push_back(genShift(0,0)); - shifts.push_back(genShift(0,+1)); - shifts.push_back(genShift(+1,+1)); - shifts.push_back(genShift(+2,0)); - shifts.push_back(genShift(+1,0)); - - //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) - shifts.push_back(genShift(0,-1)); - shifts.push_back(genShift(0,-1)); - shifts.push_back(genShift(+1,-1)); - shifts.push_back(genShift(+2,-1)); - shifts.push_back(genShift(+1,0)); - - //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) - shifts.push_back(genShift(-1,0)); - shifts.push_back(genShift(-1,-1)); - shifts.push_back(genShift(-1,-1)); - shifts.push_back(genShift(0,-1)); - shifts.push_back(genShift(+1,-1)); - - //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) - shifts.push_back(genShift(-1,0)); - shifts.push_back(genShift(-1,0)); - shifts.push_back(genShift(-1,+1)); - shifts.push_back(genShift(0,+1)); - shifts.push_back(genShift(+1,0)); - - //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) - shifts.push_back(genShift(0,0)); - shifts.push_back(genShift(0,+1)); - shifts.push_back(genShift(0,+2)); - shifts.push_back(genShift(+1,+1)); - shifts.push_back(genShift(+1,0)); - - //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) - shifts.push_back(genShift(0,-1)); - shifts.push_back(genShift(0,-2)); - shifts.push_back(genShift(0,-2)); - shifts.push_back(genShift(+1,-2)); - shifts.push_back(genShift(+1,-1)); - } - } - } - size_t nshift = shifts.size(); + size_t nshift = gStencil._npoints; int mu_off_delta = nshift / Nd; - double t1 = usecond(); - - GeneralLocalStencil gStencil(ggrid,shifts); - double t2 = usecond(); //Open views to padded gauge links and keep open over mu loop typedef LatticeView GaugeViewType; @@ -1208,13 +1306,20 @@ public: free(Ug_dirs_v_host); acceleratorFreeDevice(Ug_dirs_v); - double t3 = usecond(); + double t1 = usecond(); - std::cout << GridLogPerformance << "RectStaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl; + std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl; } - - + //A workspace for reusing the PaddedCell and GeneralLocalStencil objects + class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{ + public: + StapleAndRectStapleAllWorkspace(){ + this->addStencil(new StaplePaddedAllWorkspace); + this->addStencil(new RectStaplePaddedAllWorkspace); + } + }; + ////////////////////////////////////////////////////// //Compute the 1x1 and 1x2 staples for all orientations //Stap : Array of staples (Nd) @@ -1222,27 +1327,39 @@ public: //U: Gauge links in each direction (Nd) ///////////////////////////////////////////////////// static void StapleAndRectStapleAll(std::vector &Stap, std::vector &RectStap, const std::vector &U){ + StapleAndRectStapleAllWorkspace wk; + StapleAndRectStapleAll(Stap,RectStap,U,wk); + } + + ////////////////////////////////////////////////////// + //Compute the 1x1 and 1x2 staples for all orientations + //Stap : Array of staples (Nd) + //RectStap: Array of rectangular staples (Nd) + //U: Gauge links in each direction (Nd) + //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse + ///////////////////////////////////////////////////// + static void StapleAndRectStapleAll(std::vector &Stap, std::vector &RectStap, const std::vector &U, StapleAndRectStapleAllWorkspace &wk){ #if 0 StapleAll(Stap, U); RectStapleAll(RectStap, U); #else double t0 = usecond(); - //Use the padded cell with maximal reuse - PaddedCell Ghost(2, dynamic_cast(U[0].Grid())); + + GridCartesian* unpadded_grid = dynamic_cast(U[0].Grid()); + const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid); + CshiftImplGauge cshift_impl; std::vector U_pad(Nd, Ghost.grids.back()); for(int mu=0;mu