1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-03 18:55:56 +01:00

Added a copyLane function to Tensor_extract_merge.h that copies one lane of data from an input tensor object to a (possibly different) lane of an output tensor object, which may be of a different precision

The precisionChange lattice function now uses copyLane, removing the need for temporary scalar objects; this reduces the register footprint and significantly improves performance
This commit is contained in:
Christopher Kelly 2021-01-06 11:50:56 -05:00
parent 287bac946f
commit 1fb41a4300
2 changed files with 44 additions and 23 deletions

View File

@ -1058,19 +1058,10 @@ public:
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn;
typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
typedef typename SobjOut::scalar_type SfundOut;
constexpr int Nsimd_out = VobjOut::Nsimd();
constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
@ -1083,18 +1074,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
//Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
SobjOut sobj_out;
SfundIn tmp_in;
SfundOut tmp_out;
for(int i=0;i<Nfund_out;i++){ //the blessed way to do type punning!
memcpy( (char*)(&tmp_in), (char*)(&sobj_in) + i*sizeof(SfundIn), sizeof(SfundIn) );
tmp_out = tmp_in; //the precision change
memcpy( (char*)(&sobj_out) + i*sizeof(SfundOut), (char*)(&tmp_out), sizeof(SfundOut) );
}
insertLane(out_lane, out_v[out_oidx], sobj_out);
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}

View File

@ -208,5 +208,46 @@ void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
}
//////////////////////////////////////////////////////////////////////////////////
// copyLane: transfer a single SIMD lane from one tensor object into another.
//
//   vecOut   - destination object; only lane `lane_out` of each vector word is written
//   lane_out - index of the destination lane
//   vecIn    - source object; lane `lane_in` of each vector word is read
//   lane_in  - index of the source lane
//
// The two objects must have the same tensor structure (checked at compile time via
// their DoublePrecision types and their vector word counts) but their scalar types
// may differ; each element is converted by plain assignment.
// No range checking is performed on the lane indices.
//////////////////////////////////////////////////////////////////////////////////
template<class vobjOut, class vobjIn>
accelerator_inline
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
{
  //if tensor types are same the DoublePrecision type must be the same
  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value,
                 "copyLane: tensor types must be the same" );

  using out_vector_t = typename vobjOut::vector_type;
  using in_vector_t  = typename vobjIn::vector_type;

  // Number of SIMD vector words making up each object
  constexpr int n_words_out = sizeof(vobjOut)/sizeof(out_vector_t);
  constexpr int n_words_in  = sizeof(vobjIn)/sizeof(in_vector_t);
  static_assert( n_words_out == n_words_in,
                 "copyLane: Expected number of vector words in input and output objects to be equal" );

  using out_scalar_t  = typename vobjOut::scalar_type;
  using in_scalar_t   = typename vobjIn::scalar_type;
  using out_extract_t = typename ExtractTypeMap<out_scalar_t>::extract_type;
  using in_extract_t  = typename ExtractTypeMap<in_scalar_t>::extract_type;

  constexpr int nsimd_out = out_vector_t::Nsimd();
  constexpr int nsimd_in  = in_vector_t::Nsimd();

  // View both objects as flat arrays of per-lane elements: word w, lane l sits at index l + Nsimd*w
  out_extract_t * __restrict__ dst      = reinterpret_cast<out_extract_t *>(&vecOut);
  const in_extract_t * __restrict__ src = reinterpret_cast<const in_extract_t *>(&vecIn);

  in_scalar_t  elem_in;
  out_scalar_t elem_out;

  for(int word=0; word<n_words_out; word++){
    // memcpy is used for both the load and the store so that no typed access is made through the cast pointers
    memcpy( &elem_in, src + lane_in + nsimd_in*word, sizeof(in_scalar_t) );
    elem_out = elem_in; //potential precision change
    memcpy( dst + lane_out + nsimd_out*word, &elem_out, sizeof(out_scalar_t) );
  }
}
NAMESPACE_END(Grid);