diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h index fcd79d5f..fc464ea1 100644 --- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h @@ -62,6 +62,7 @@ public: std::vector > _A; std::vector MultTemporaries; + deviceVector StencilMasked; /////////////////////// // Interface @@ -78,9 +79,40 @@ public: Stencil(Cell.grids.back(),geom.shifts) { _A.resize(geom.npoint); + int32_t padded_sites = _Op._A[0].Grid()->lSites(); for(int p=0;plSites()); + _A[p].resize(padded_sites); } + std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <oSites()); + std::vector StencilTmp; + + int32_t j=0; + int32_t sites = Stencil._entries.size()/geom.npoint; + for(int32_t s=0;soSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<lSites()*geom.npoint==StencilTmp.size()); + acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size()); CopyMatrix(); } void CopyMatrix (void) @@ -100,12 +132,18 @@ public: } void M (const CoarseVector &in, CoarseVector &out) { + RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0; + RealD tmult2=0; + + ttot=-usecond(); conformable(CoarseGrid(),in.Grid()); conformable(in.Grid(),out.Grid()); out.Checkerboard() = in.Checkerboard(); CoarseVector tin=in; + texch-=usecond(); CoarseVector pin = Cell.ExchangePeriodic(tin); + texch+=usecond(); CoarseVector pout(pin.Grid()); int npoint = geom.npoint; @@ -116,22 +154,33 @@ public: int64_t osites=pin.Grid()->oSites(); int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd; + assert(nrhs>=1); + RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + + 2.0*osites*sizeof(siteVector)*npoint; + { + tviews-=usecond(); autoView( in_v , pin, AcceleratorRead); autoView( out_v , pout, AcceleratorWriteDiscard); autoView( Stencil_v , Stencil, AcceleratorRead); + tviews+=usecond(); // Static and prereserve to keep UVM region live and not resized across multiple calls + ttemps-=usecond(); MultTemporaries.resize(npoint,pin.Grid()); + ttemps+=usecond(); std::vector AcceleratorViewContainer_h; std::vector AcceleratorVecViewContainer_h; + tviews-=usecond(); for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); @@ -139,15 +188,23 @@ public: auto Aview_p = &AcceleratorViewContainer[0]; auto Vview_p = &AcceleratorVecViewContainer[0]; + tcopy-=usecond(); acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); + tcopy+=usecond(); + int32_t bound = _A[0].size(); + std::cout << " osites "<LocalDimensions()<LocalDimensions()<_permute == 0 ) { int32_t snbr= SE->_offset; @@ -159,6 +216,7 @@ public: coalescedWrite(Vview_p[point][ss](b),res); } }); + tmult2-=usecond(); accelerator_for(sb, osites*nbasis, Nsimd, { int ss = sb/nbasis; int b = sb%nbasis; @@ -168,13 +226,32 @@ public: } coalescedWrite(out_v[ss](b),res); }); + tmult2+=usecond(); + tmult+=usecond(); for(int p=0;p &From,Lattice & To,Coordinate Fro typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; - static const int words=sizeof(vobj)/sizeof(vector_type); - GridBase *Fg = From.Grid(); GridBase *Tg = To.Grid(); assert(!Fg->_isCheckerBoarded); @@ -763,13 +761,14 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro // the above should guarantee that the operations are local #if 1 - size_t nsite = 1; for(int i=0;i &From,Lattice & To,Coordinate Fro int* table_d = (int*)acceleratorAllocDevice(tbytes); acceleratorCopyToDevice(table,table_d,tbytes); +#else + int* table_d = (int*)acceleratorAllocDevice(tbytes); + Coordinate f_ostride = Fg->_ostride; + Coordinate f_istride = Fg->_istride; + Coordinate f_rdimensions = Fg->_rdimensions; + Coordinate t_ostride = Tg->_ostride; + Coordinate t_istride = Tg->_istride; + Coordinate t_rdimensions = Tg->_rdimensions; + + accelerator_for(idx, nsite, 1, { + Coordinate from_coor, to_coor; + size_t rem = idx; + for(int i=0;i &From,Lattice & To,Coordinate Fro vector_type* to = (vector_type *)&to_v[to_oidx]; scalar_type stmp; +#ifdef GRID_SIMT + int w = acceleratorSIMTlane(words); + stmp = getlane(from[w], from_lane); + putlane(to[w], stmp, to_lane); +#else for(int w=0;w &logstreams) { for (int i = 0; i < logstreams.size(); i++) { if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); - if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); + if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(0); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index 04eaadb7..1a36ad43 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -33,6 +33,10 @@ struct GeneralStencilEntry { uint64_t _offset; // 4 bytes uint8_t _permute; // 1 bytes // Horrible alignment properties }; +struct GeneralStencilEntryReordered : public GeneralStencilEntry { + uint64_t _output; +}; + // Could pack to 8 + 4 + 4 = 128 bit and use class GeneralLocalStencilView { diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command index 41335bd9..53b8e822 100644 --- a/systems/Frontier/config-command +++ b/systems/Frontier/config-command @@ -6,6 +6,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` --enable-tracing=timer \ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ +--enable-tracing=roctx \ --disable-gparity \ --disable-fermion-reps \ --enable-simd=GPU \ diff --git a/tests/debug/Test_general_coarse.cc b/tests/debug/Test_general_coarse.cc index f93f5790..f980d7c7 100644 --- a/tests/debug/Test_general_coarse.cc +++ b/tests/debug/Test_general_coarse.cc @@ -78,7 +78,7 @@ int main (int argc, char ** argv) // Construct a coarsened grid Coordinate clatt = GridDefaultLatt(); for(int d=0;d