From 4241c7d4a3c5151196102ee865491503af7ae05b Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 21 Jun 2023 16:01:01 -0400 Subject: [PATCH] Imported coalescedReadGeneralPermute GPU implementation from Christoph Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed Fixed compile issue with pointer cast in padded staple code Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple --- Grid/tensors/Tensor_SIMT.h | 10 +- tests/debug/Test_padded_cell_staple.cc | 306 ++++++++++++++----------- 2 files changed, 185 insertions(+), 131 deletions(-) diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 715735c9..9be2c500 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -93,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ { vstream(vec, extracted); } -#else +#else //==GRID_SIMT //#ifndef GRID_SYCL @@ -176,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec, return extractLane(plane,vec); } template accelerator_inline +typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd())) +{ + int plane = lane; + for (int d=0;d> (d + 1)) : plane; + return extractLane(plane,vec); +} +template accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd())) { insertLane(lane,vec,extracted); diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc index bed00cad..6d7bc8c9 100644 --- a/tests/debug/Test_padded_cell_staple.cc +++ b/tests/debug/Test_padded_cell_staple.cc @@ -102,18 +102,31 @@ public: staple = Zero(); return; } - PaddedCell Ghost(1,U.Grid()); + double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0; + + double tstart = usecond(); + double t=tstart; + + PaddedCell Ghost(1, (GridCartesian*)U.Grid()); + construct += usecond() - t; + + t=usecond(); GaugeMat U_mu = PeekIndex(U, mu); GaugeMat U_nu = PeekIndex(U, nu); + peek += usecond() - t; + t=usecond(); CshiftImplGauge cshift_impl; GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl); GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl); + exchange += usecond() - t; + GridBase *ggrid = Ug_mu.Grid(); GaugeMat gStaple(ggrid); + t=usecond(); Coordinate shift_0(Nd,0); Coordinate shift_mu(Nd,0); shift_mu[mu]=1; Coordinate shift_nu(Nd,0); shift_nu[nu]=1; @@ -131,37 +144,50 @@ public: shifts.push_back(shift_mnu); shifts.push_back(shift_mnu); shifts.push_back(shift_mnu_pmu); + coord += usecond()-t; + t=usecond(); GeneralLocalStencil gStencil(ggrid,shifts); - autoView( gStaple_v , gStaple, AcceleratorWrite); - auto gStencil_v = gStencil.View(); - autoView( Ug_mu_v , Ug_mu, AcceleratorRead); - autoView( Ug_nu_v , Ug_nu, AcceleratorRead); - - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { - GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); - auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(1,ss); - auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(2,ss); - auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); - - auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + stencil += usecond() -t; - e = gStencil_v.GetEntry(3,ss); - auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(4,ss); - auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(5,ss); - auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); - - stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; - - coalescedWrite(gStaple_v[ss],stencil_ss); - } - ); + t=usecond(); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + autoView( Ug_mu_v , Ug_mu, AcceleratorRead); + autoView( Ug_nu_v , Ug_nu, AcceleratorRead); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + + auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + + e = gStencil_v.GetEntry(3,ss); + auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(4,ss); + auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(5,ss); + auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + kernel += usecond() - t; + + t=usecond(); staple = Ghost.Extract(gStaple); + extract += usecond()-t; + + total += usecond() - tstart; + std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl; } static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu, @@ -299,7 +325,7 @@ public: static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, int mu) { - PaddedCell Ghost(2,U.Grid()); + PaddedCell Ghost(2,(GridCartesian*)U.Grid()); GridBase *ggrid = Ghost.grids.back(); CshiftImplGauge cshift_impl; @@ -361,117 +387,119 @@ public: size_t nshift = shifts.size(); GeneralLocalStencil gStencil(ggrid,shifts); - autoView( gStaple_v , gStaple, AcceleratorWrite); - auto gStencil_v = gStencil.View(); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); - typedef LatticeView GaugeViewType; - size_t vsize = Nd*sizeof(GaugeViewType); - GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); - for(int i=0;i GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { - decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; - stencil_ss = Zero(); - int s=0; - for(int nu=0;nu_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=0; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) - e = gStencil_v.GetEntry(s++,ss); - U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; - //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) - e = gStencil_v.GetEntry(s++,ss); - U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); - e = gStencil_v.GetEntry(s++,ss); - U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - e = gStencil_v.GetEntry(s++,ss); - U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); - stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + } } + assert(s==nshift); + coalescedWrite(gStaple_v[ss],stencil_ss); } - assert(s==nshift); - coalescedWrite(gStaple_v[ss],stencil_ss); - } - ); + ); - Stap = Ghost.Extract(gStaple); - for(int i=0;i::GaugeMat GaugeMat; typedef typename WilsonLoopsTest::GaugeLorentz GaugeLorentz; - std::cout << "Checking Staple" << std::endl; + std::cout << GridLogMessage << "Checking Staple" << std::endl; + int count = 0; + double torig=0, tpadded=0; + for(int mu=0;mu::StapleOrig(staple_orig,U,mu,nu); + double t0 = usecond(); + WilsonLoopsTest::StapleOrig(staple_orig,U,mu,nu); + double t1 = usecond(); WilsonLoopsTest::StaplePadded(staple_padded,U,mu,nu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + GaugeMat diff = staple_orig - staple_padded; double n = norm2(diff); - std::cout << mu << " " << nu << " " << n << std::endl; + std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl; assert(n<1e-10); } } } - - std::cout << "Checking RectStaple" << std::endl; + std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + count=0; torig=tpadded=0; + + std::cout << GridLogMessage << "Checking RectStaple" << std::endl; for(int mu=0;mu::RectStapleOrig(staple_orig,U,mu); + double t0 = usecond(); + WilsonLoopsTest::RectStapleOrig(staple_orig,U,mu); + double t1 = usecond(); WilsonLoopsTest::RectStaplePadded(staple_padded,U,mu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + GaugeMat diff = staple_orig - staple_padded; double n = norm2(diff); - std::cout << mu << " " << n << std::endl; + std::cout << GridLogMessage << mu << " " << n << std::endl; assert(n<1e-10); } - + std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + Grid_finalize(); }