From 1fb41a4300346bfb4e4f315999a6bf9259e720d7 Mon Sep 17 00:00:00 2001
From: Christopher Kelly
Date: Wed, 6 Jan 2021 11:50:56 -0500
Subject: [PATCH] Added copyLane function to Tensor_extract_merge.h which
 copies one lane of data from an input tensor object to a different lane of
 an output tensor object of potentially different precision

precisionChange lattice function now uses copyLane to remove the need for
temporary scalar objects, reducing register footprint and significantly
improving performance
---
 Grid/lattice/Lattice_transfer.h     | 26 +++-----------------------
 Grid/tensors/Tensor_extract_merge.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 9af347e8..0a5bd458 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -1058,19 +1058,10 @@ public:
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are the same, the DoublePrecision type must be the same
+
   out.Checkerboard() = in.Checkerboard();
-
-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
-
-  typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
-  typedef typename SobjOut::scalar_type SfundOut;
   constexpr int Nsimd_out = VobjOut::Nsimd();
-  constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
-  constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
-
-  static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
 
   std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
@@ -1083,18 +1074,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
     for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
       int in_oidx = fmap_osite[out_lane].first;
       int in_lane = fmap_osite[out_lane].second;
-
-      //Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
-      SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
-      SobjOut sobj_out;
-      SfundIn tmp_in;
-      SfundOut tmp_out;
-      for(int i=0;i<Nfund_in;i++){
-        tmp_in = ((SfundIn const*)&sobj_in)[i];
-        tmp_out = tmp_in; //convert precision
-        ((SfundOut*)&sobj_out)[i] = tmp_out;
-      }
-      insertLane(out_lane, out_v[out_oidx], sobj_out);
+      copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ ... @@ ...ExtractPointerArray<sobj> &extracted, int offset)
 }
 
+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are the same, the DoublePrecision type must be the same
+
+  typedef typename vobjOut::vector_type ovector_type;
+  typedef typename vobjIn::vector_type ivector_type;
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  typedef typename vobjOut::scalar_type oscalar_type;
+  typedef typename vobjIn::scalar_type iscalar_type;
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+
+  typedef oextract_type * opointer;
+  typedef iextract_type * ipointer;
+
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+
+  opointer __restrict__ op = (opointer)&vecOut;
+  ipointer __restrict__ ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){
+    //pull lane 'lane_in' of word w from the input, convert precision via the scalar temporaries, write to lane 'lane_out' of word w of the output
+    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    otmp = itmp; //precision change happens here
+    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+  }
+}
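
Usage sketch for the new helper (illustrative only, not part of the patch): copyLane reads one SIMD lane of the input tensor, converts each fundamental complex/real datum to the output precision, and writes it into the chosen lane of the output tensor. The iVector/vComplexF/vComplexD types below are standard Grid tensor building blocks; the surrounding function is an assumed test scaffold.

    #include <Grid/Grid.h>
    using namespace Grid;

    void copyLane_example(void){
      iVector<vComplexD,3> src; zeroit(src);  // double-precision SIMD colour vector
      iVector<vComplexF,3> dst; zeroit(dst);  // single-precision SIMD colour vector

      // Copy SIMD lane 0 of src into SIMD lane 0 of dst; the ComplexD -> ComplexF
      // conversion happens per fundamental datum inside copyLane, with no
      // intermediate scalar_object. Lane indices must be < Nsimd() of the
      // respective vector types.
      copyLane(dst, 0, src, 0);
    }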
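
For context, a hedged sketch of the calling pattern the reworked precisionChange serves. Only precisionChange(out, in, workspace) and workspace.getMap() appear in this patch, so the precisionChangeWorkspace constructor arguments (output grid first, then input grid) are an assumption to be checked against Lattice_transfer.h.

    #include <Grid/Grid.h>
    using namespace Grid;

    // GridF and GridD are assumed to be single- and double-precision grids with
    // identical global dimensions, set up elsewhere. The workspace caches the
    // (site,lane) mapping so repeated conversions avoid recomputing it.
    void convert_example(GridBase *GridF, GridBase *GridD){
      LatticeFermionD field_d(GridD); field_d = Zero();
      LatticeFermionF field_f(GridF);

      precisionChangeWorkspace wk(GridF, GridD); // assumed argument order: output grid, input grid
      precisionChange(field_f, field_d, wk);     // each output SIMD lane is filled via copyLane
    }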