From d7a2a4852d1280d9b80b17cd827073d1c3558274 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 6 Jan 2021 09:30:49 -0500 Subject: [PATCH] Reimplemented precisionChange to run on GPUs. A workspace containing the mapping table can be optionally precomputed and reused for improved performance. --- Grid/lattice/Lattice_transfer.h | 136 +++++++++++++++++++++++--------- 1 file changed, 99 insertions(+), 37 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 91de721f..9af347e8 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -998,54 +998,116 @@ vectorizeFromRevLexOrdArray( std::vector &in, Lattice &out) }); } -//Convert a Lattice from one precision to another -template -void precisionChange(Lattice &out, const Lattice &in) -{ - assert(out.Grid()->Nd() == in.Grid()->Nd()); - for(int d=0;dNd();d++){ - assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]); +//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls +class precisionChangeWorkspace{ + std::pair* fmap_device; //device pointer +public: + precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){ + //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device + assert(out_grid->Nd() == in_grid->Nd()); + for(int d=0;dNd();d++){ + assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); + } + int Nsimd_out = out_grid->Nsimd(); + + std::vector out_icorrs(out_grid->Nsimd()); //reuse these + for(int lane=0; lane < out_grid->Nsimd(); lane++) + out_grid->iCoorFromIindex(out_icorrs[lane], lane); + + std::vector > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd + thread_for(out_oidx,out_grid->oSites(),{ + Coordinate out_ocorr; + out_grid->oCoorFromOindex(out_ocorr, out_oidx); + + Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate) + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr); + + //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr); + //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice + //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity + int in_oidx = 0, in_lane = 0; + for(int d=0;d_ndimension;d++){ + in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] ); + in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] ); + } + fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair( in_oidx, in_lane ); + } + }); + + //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines) + size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair); + fmap_device = (std::pair*)acceleratorAllocDevice(fmap_bytes); + acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); } + + //Prevent moving or copying + precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete; + precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete; + + std::pair const* getMap() const{ return fmap_device; } + + ~precisionChangeWorkspace(){ + acceleratorFreeDevice(fmap_device); + } +}; + + +//Convert a lattice of one precision to another. The input workspace contains the mapping data. +template +void precisionChange(Lattice &out, const Lattice &in, const precisionChangeWorkspace &workspace){ out.Checkerboard() = in.Checkerboard(); - GridBase *in_grid=in.Grid(); - GridBase *out_grid = out.Grid(); typedef typename VobjOut::scalar_object SobjOut; typedef typename VobjIn::scalar_object SobjIn; - int ndim = out.Grid()->Nd(); - int out_nsimd = out_grid->Nsimd(); - - std::vector out_icoor(out_nsimd); - - for(int lane=0; lane < out_nsimd; lane++){ - out_icoor[lane].resize(ndim); - out_grid->iCoorFromIindex(out_icoor[lane], lane); - } - - std::vector in_slex_conv(in_grid->lSites()); - unvectorizeToLexOrdArray(in_slex_conv, in); - - autoView( out_v , out, CpuWrite); - thread_for(out_oidx,out_grid->oSites(),{ - Coordinate out_ocoor(ndim); - out_grid->oCoorFromOindex(out_ocoor, out_oidx); + typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types + typedef typename SobjOut::scalar_type SfundOut; - ExtractPointerArray ptrs(out_nsimd); + constexpr int Nsimd_out = VobjOut::Nsimd(); + constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn); + constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same! - Coordinate lcoor(out_grid->Nd()); - - for(int lane=0; lane < out_nsimd; lane++){ - for(int mu=0;mu_rdimensions[mu]*out_icoor[lane][mu]; + static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!"); + + std::pair const* fmap_device = workspace.getMap(); + + //Do the copy/precision change + autoView( out_v , out, AcceleratorWrite); + autoView( in_v , in, AcceleratorRead); + + accelerator_for(out_oidx, out.Grid()->oSites(), 1,{ + std::pair const* fmap_osite = fmap_device + out_oidx*Nsimd_out; + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + int in_oidx = fmap_osite[out_lane].first; + int in_lane = fmap_osite[out_lane].second; - int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions); - ptrs[lane] = &in_slex_conv[llex]; - } - merge(out_v[out_oidx], ptrs, 0); - }); + //Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects + SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]); + SobjOut sobj_out; + SfundIn tmp_in; + SfundOut tmp_out; + for(int i=0;i +void precisionChange(Lattice &out, const Lattice &in){ + precisionChangeWorkspace workspace(out.Grid(), in.Grid()); + precisionChange(out, in, workspace); +} + + //////////////////////////////////////////////////////////////////////////////// // Communicate between grids ////////////////////////////////////////////////////////////////////////////////