mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-15 14:27:06 +01:00
Reimplemented precisionChange to run on GPUs. A workspace containing the mapping table can be optionally precomputed and reused for improved performance.
This commit is contained in:
@ -998,54 +998,116 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
//Convert a Lattice from one precision to another
|
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
|
||||||
template<class VobjOut, class VobjIn>
|
class precisionChangeWorkspace{
|
||||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
std::pair<Integer,Integer>* fmap_device; //device pointer
|
||||||
{
|
public:
|
||||||
assert(out.Grid()->Nd() == in.Grid()->Nd());
|
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
|
||||||
for(int d=0;d<out.Grid()->Nd();d++){
|
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
|
||||||
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
|
assert(out_grid->Nd() == in_grid->Nd());
|
||||||
|
for(int d=0;d<out_grid->Nd();d++){
|
||||||
|
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
|
||||||
}
|
}
|
||||||
|
int Nsimd_out = out_grid->Nsimd();
|
||||||
|
|
||||||
|
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
|
||||||
|
for(int lane=0; lane < out_grid->Nsimd(); lane++)
|
||||||
|
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
|
||||||
|
|
||||||
|
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
|
||||||
|
thread_for(out_oidx,out_grid->oSites(),{
|
||||||
|
Coordinate out_ocorr;
|
||||||
|
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
|
||||||
|
|
||||||
|
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
|
||||||
|
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
|
||||||
|
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
|
||||||
|
|
||||||
|
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
|
||||||
|
//Note: oIndex and oCoorFromOindex (and the same for iIndex) are not inverses of each other on a checkerboarded lattice; the former uses coordinates defined on the full lattice and the latter coordinates defined on the reduced lattice
|
||||||
|
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
|
||||||
|
int in_oidx = 0, in_lane = 0;
|
||||||
|
for(int d=0;d<in_grid->_ndimension;d++){
|
||||||
|
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
|
||||||
|
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
|
||||||
|
}
|
||||||
|
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
|
||||||
|
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
|
||||||
|
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
|
||||||
|
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
//Prevent moving or copying
|
||||||
|
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
|
||||||
|
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
|
||||||
|
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
|
||||||
|
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
|
||||||
|
|
||||||
|
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
|
||||||
|
|
||||||
|
~precisionChangeWorkspace(){
|
||||||
|
acceleratorFreeDevice(fmap_device);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
|
||||||
|
template<class VobjOut, class VobjIn>
|
||||||
|
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
GridBase *in_grid=in.Grid();
|
|
||||||
GridBase *out_grid = out.Grid();
|
|
||||||
|
|
||||||
typedef typename VobjOut::scalar_object SobjOut;
|
typedef typename VobjOut::scalar_object SobjOut;
|
||||||
typedef typename VobjIn::scalar_object SobjIn;
|
typedef typename VobjIn::scalar_object SobjIn;
|
||||||
|
|
||||||
int ndim = out.Grid()->Nd();
|
typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
|
||||||
int out_nsimd = out_grid->Nsimd();
|
typedef typename SobjOut::scalar_type SfundOut;
|
||||||
|
|
||||||
std::vector<Coordinate > out_icoor(out_nsimd);
|
constexpr int Nsimd_out = VobjOut::Nsimd();
|
||||||
|
constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
|
||||||
|
constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
|
||||||
|
|
||||||
for(int lane=0; lane < out_nsimd; lane++){
|
static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
|
||||||
out_icoor[lane].resize(ndim);
|
|
||||||
out_grid->iCoorFromIindex(out_icoor[lane], lane);
|
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
|
||||||
|
|
||||||
|
//Do the copy/precision change
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
|
||||||
|
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
|
||||||
|
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
|
||||||
|
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
|
||||||
|
int in_oidx = fmap_osite[out_lane].first;
|
||||||
|
int in_lane = fmap_osite[out_lane].second;
|
||||||
|
|
||||||
|
//Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
|
||||||
|
SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
|
||||||
|
SobjOut sobj_out;
|
||||||
|
SfundIn tmp_in;
|
||||||
|
SfundOut tmp_out;
|
||||||
|
for(int i=0;i<Nfund_out;i++){ //the blessed way to do type punning!
|
||||||
|
memcpy( (char*)(&tmp_in), (char*)(&sobj_in) + i*sizeof(SfundIn), sizeof(SfundIn) );
|
||||||
|
tmp_out = tmp_in; //the precision change
|
||||||
|
memcpy( (char*)(&sobj_out) + i*sizeof(SfundOut), (char*)(&tmp_out), sizeof(SfundOut) );
|
||||||
}
|
}
|
||||||
|
insertLane(out_lane, out_v[out_oidx], sobj_out);
|
||||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
|
||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
|
||||||
|
|
||||||
autoView( out_v , out, CpuWrite);
|
|
||||||
thread_for(out_oidx,out_grid->oSites(),{
|
|
||||||
Coordinate out_ocoor(ndim);
|
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
|
||||||
|
|
||||||
ExtractPointerArray<SobjOut> ptrs(out_nsimd);
|
|
||||||
|
|
||||||
Coordinate lcoor(out_grid->Nd());
|
|
||||||
|
|
||||||
for(int lane=0; lane < out_nsimd; lane++){
|
|
||||||
for(int mu=0;mu<ndim;mu++)
|
|
||||||
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
|
|
||||||
|
|
||||||
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
|
|
||||||
ptrs[lane] = &in_slex_conv[llex];
|
|
||||||
}
|
}
|
||||||
merge(out_v[out_oidx], ptrs, 0);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Convert a Lattice from one precision to another
|
||||||
|
//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing it
|
||||||
|
template<class VobjOut, class VobjIn>
|
||||||
|
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
||||||
|
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
|
||||||
|
precisionChange(out, in, workspace);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Communicate between grids
|
// Communicate between grids
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
Reference in New Issue
Block a user