Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-03 18:55:56 +01:00)
Added a copyLane function to Tensor_extract_merge.h which copies one lane of data from an input tensor object to a different lane of an output tensor object of potentially different precision.

The precisionChange lattice function now uses copyLane, removing the need for temporary scalar objects, reducing the register footprint and significantly improving performance.
This commit is contained in:
parent 287bac946f
commit 1fb41a4300
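
For orientation, a minimal usage sketch of the workspace-driven conversion that this commit speeds up. The grid setup and field names below are illustrative, and the (output grid, input grid) argument order of the precisionChangeWorkspace constructor is assumed rather than taken from this diff, so check the class definition in the lattice header before reusing it:

    // Sketch only: build the site/lane mapping once, then reuse it for repeated
    // double -> single precision conversions of a fermion field.
    #include <Grid/Grid.h>
    using namespace Grid;

    int main(int argc, char **argv)
    {
      Grid_init(&argc, &argv);

      // Two grids with the same geometry but different SIMD layouts
      GridCartesian *GridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                               GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
      GridCartesian *GridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                               GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());

      LatticeFermionD src_d(GridD); src_d = Zero();
      LatticeFermionF dst_f(GridF);

      precisionChangeWorkspace wk(GridF, GridD); // assumed (out_grid, in_grid) order
      precisionChange(dst_f, src_d, wk);         // now copies lane-by-lane via copyLane

      Grid_finalize();
      return 0;
    }
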
@@ -1058,19 +1058,10 @@ public:
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
   out.Checkerboard() = in.Checkerboard();
 
-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
-
-  typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
-  typedef typename SobjOut::scalar_type SfundOut;
-
   constexpr int Nsimd_out = VobjOut::Nsimd();
-  constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
-  constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
-
-  static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
-
   std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
 
@@ -1083,18 +1074,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
     for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
       int in_oidx = fmap_osite[out_lane].first;
       int in_lane = fmap_osite[out_lane].second;
-
-      //Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
-      SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
-      SobjOut sobj_out;
-      SfundIn tmp_in;
-      SfundOut tmp_out;
-      for(int i=0;i<Nfund_out;i++){ //the blessed way to do type punning!
-        memcpy( (char*)(&tmp_in), (char*)(&sobj_in) + i*sizeof(SfundIn), sizeof(SfundIn) );
-        tmp_out = tmp_in; //the precision change
-        memcpy( (char*)(&sobj_out) + i*sizeof(SfundOut), (char*)(&tmp_out), sizeof(SfundOut) );
-      }
-      insertLane(out_lane, out_v[out_oidx], sobj_out);
+      copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
     }
   });
 }
@@ -208,5 +208,46 @@ void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
 }
 
 
+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  typedef typename vobjOut::vector_type ovector_type;
+  typedef typename vobjIn::vector_type ivector_type;
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  typedef typename vobjOut::scalar_type oscalar_type;
+  typedef typename vobjIn::scalar_type iscalar_type;
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+
+  typedef oextract_type * opointer;
+  typedef iextract_type * ipointer;
+
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+
+  opointer __restrict__ op = (opointer)&vecOut;
+  ipointer __restrict__ ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){
+    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    otmp = itmp; //potential precision change
+    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+  }
+}
+
+
 NAMESPACE_END(Grid);
 
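
To make the new helper concrete, a minimal usage sketch follows. It copies each SIMD lane of a double-precision colour matrix into the corresponding lane of a single-precision one; the tensor types and zeroit initialisation are standard Grid building blocks, but the example itself is illustrative and not part of the commit (whole-field conversions should go through precisionChange instead):

    // Sketch only: per-lane double -> float copy between two SIMD tensor objects.
    #include <Grid/Grid.h>
    using namespace Grid;

    int main(int argc, char **argv)
    {
      Grid_init(&argc, &argv);

      vColourMatrixD md; zeroit(md);  // one SIMD word of double-precision 3x3 colour matrices
      vColourMatrixF mf; zeroit(mf);  // single-precision destination of the same tensor structure

      // vComplexF holds at least as many lanes as vComplexD, so every input lane has a
      // valid slot in the output; each underlying complex number is narrowed by copyLane.
      for(int l=0; l<vComplexD::Nsimd(); l++){
        copyLane(mf, l, md, l);
      }

      Grid_finalize();
      return 0;
    }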