1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-08-17 11:41:53 +01:00

Make view specify where and drive data motion - first cut.

This is a compile tiime option --enable-unified=yes/no
This commit is contained in:
Peter Boyle
2020-05-21 16:13:16 -04:00
parent ebb60330c9
commit 7860a50f70
48 changed files with 688 additions and 718 deletions

View File

@@ -49,8 +49,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.Checkerboard() = cb;
auto half_v = half.View();
auto full_v = full.View();
auto half_v = half.View(CpuWrite);
auto full_v = full.View(CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
@@ -65,8 +65,8 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
}
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.Checkerboard();
auto half_v = half.View();
auto full_v = full.View();
auto half_v = half.View(CpuRead);
auto full_v = full.View(CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
@@ -92,9 +92,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<CComplex> ip(coarse);
// auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto ip_ = ip.View();
auto coarseData_ = coarseData.View(AcceleratorWrite);
auto ip_ = ip.View(AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
blockInnerProduct(ip,Basis[v],fineData);
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
@@ -102,7 +101,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
});
}
}
#if 0
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
@@ -132,8 +131,8 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View(AcceleratorRead);
auto coarseData_ = coarseData.View(AcceleratorWrite);
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic
@@ -142,7 +141,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
auto Basis_ = Basis[i].View(AcceleratorRead);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
@@ -165,6 +164,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
});
return;
}
#endif
template<class vobj,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -191,10 +191,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
auto fineZ_ = fineZ.View();
auto fineX_ = fineX.View();
auto fineY_ = fineY.View();
auto coarseA_= coarseA.View();
auto fineZ_ = fineZ.View(AcceleratorWrite);
auto fineX_ = fineX.View(AcceleratorRead);
auto fineY_ = fineY.View(AcceleratorRead);
auto coarseA_= coarseA.View(AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -227,11 +227,10 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
auto CoarseInner_ = CoarseInner.View(AcceleratorWrite);
auto coarse_inner_ = coarse_inner.View(AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
@@ -266,8 +265,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop
// over sf would thread better
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View(AcceleratorWrite);
auto fineData_ = fineData.View(AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{
@@ -360,8 +359,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View(AcceleratorWrite);
auto coarseData_ = coarseData.View(AcceleratorRead);
// Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{
@@ -374,7 +373,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View();
/* auto basis_ = Basis[i].View( );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
@@ -395,8 +394,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse);
auto cip_ = cip.View();
auto ip_ = ip.View();
auto cip_ = cip.View(AcceleratorWrite);
auto ip_ = ip.View(AcceleratorRead);
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
coalescedWrite(cip_[sc], ip_(sc)());
});
@@ -470,8 +469,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
auto t_v = To.View();
auto f_v = From.View();
auto t_v = To.View(AcceleratorWrite);
auto f_v = From.View(AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
@@ -718,7 +717,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
}
//loop over outer index
auto in_v = in.View();
auto in_v = in.View(CpuRead);
thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@@ -811,7 +810,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane);
}
auto out_v = out.View();
auto out_v = out.View(CpuWrite);
thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd);
@@ -914,7 +913,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View();
auto out_v = out.View(CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);