mirror of
https://github.com/paboyle/Grid.git
synced 2025-08-17 11:41:53 +01:00
Make view specify where and drive data motion - first cut.
This is a compile tiime option --enable-unified=yes/no
This commit is contained in:
@@ -49,8 +49,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||
half.Checkerboard() = cb;
|
||||
|
||||
auto half_v = half.View();
|
||||
auto full_v = full.View();
|
||||
auto half_v = half.View(CpuWrite);
|
||||
auto full_v = full.View(CpuRead);
|
||||
thread_for(ss, full.Grid()->oSites(),{
|
||||
int cbos;
|
||||
Coordinate coor;
|
||||
@@ -65,8 +65,8 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
||||
}
|
||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||
int cb = half.Checkerboard();
|
||||
auto half_v = half.View();
|
||||
auto full_v = full.View();
|
||||
auto half_v = half.View(CpuRead);
|
||||
auto full_v = full.View(CpuWrite);
|
||||
thread_for(ss,full.Grid()->oSites(),{
|
||||
|
||||
Coordinate coor;
|
||||
@@ -92,9 +92,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
|
||||
Lattice<CComplex> ip(coarse);
|
||||
|
||||
// auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View();
|
||||
auto ip_ = ip.View();
|
||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
||||
auto ip_ = ip.View(AcceleratorWrite);
|
||||
for(int v=0;v<nbasis;v++) {
|
||||
blockInnerProduct(ip,Basis[v],fineData);
|
||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||
@@ -102,7 +101,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
template<class vobj,class CComplex,int nbasis>
|
||||
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
const Lattice<vobj> &fineData,
|
||||
@@ -132,8 +131,8 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
|
||||
coarseData=Zero();
|
||||
|
||||
auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View();
|
||||
auto fineData_ = fineData.View(AcceleratorRead);
|
||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
|
||||
// Otherwise do fine inner product per site, and make the update atomic
|
||||
@@ -142,7 +141,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
|
||||
auto sc=sci/nbasis;
|
||||
auto i=sci%nbasis;
|
||||
auto Basis_ = Basis[i].View();
|
||||
auto Basis_ = Basis[i].View(AcceleratorRead);
|
||||
|
||||
Coordinate coor_c(_ndimension);
|
||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||
@@ -165,6 +164,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
});
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class vobj,class CComplex>
|
||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||
@@ -191,10 +191,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||
}
|
||||
|
||||
auto fineZ_ = fineZ.View();
|
||||
auto fineX_ = fineX.View();
|
||||
auto fineY_ = fineY.View();
|
||||
auto coarseA_= coarseA.View();
|
||||
auto fineZ_ = fineZ.View(AcceleratorWrite);
|
||||
auto fineX_ = fineX.View(AcceleratorRead);
|
||||
auto fineY_ = fineY.View(AcceleratorRead);
|
||||
auto coarseA_= coarseA.View(AcceleratorRead);
|
||||
|
||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||
|
||||
@@ -227,11 +227,10 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
||||
Lattice<dotp> coarse_inner(coarse);
|
||||
|
||||
// Precision promotion?
|
||||
auto CoarseInner_ = CoarseInner.View();
|
||||
auto coarse_inner_ = coarse_inner.View();
|
||||
|
||||
fine_inner = localInnerProduct(fineX,fineY);
|
||||
blockSum(coarse_inner,fine_inner);
|
||||
auto CoarseInner_ = CoarseInner.View(AcceleratorWrite);
|
||||
auto coarse_inner_ = coarse_inner.View(AcceleratorRead);
|
||||
accelerator_for(ss, coarse->oSites(), 1, {
|
||||
CoarseInner_[ss] = coarse_inner_[ss];
|
||||
});
|
||||
@@ -266,8 +265,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
||||
|
||||
// Turn this around to loop threaded over sc and interior loop
|
||||
// over sf would thread better
|
||||
auto coarseData_ = coarseData.View();
|
||||
auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
||||
auto fineData_ = fineData.View(AcceleratorRead);
|
||||
|
||||
accelerator_for(sc,coarse->oSites(),1,{
|
||||
|
||||
@@ -360,8 +359,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
for(int d=0 ; d<_ndimension;d++){
|
||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||
}
|
||||
auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View();
|
||||
auto fineData_ = fineData.View(AcceleratorWrite);
|
||||
auto coarseData_ = coarseData.View(AcceleratorRead);
|
||||
|
||||
// Loop with a cache friendly loop ordering
|
||||
accelerator_for(sf,fine->oSites(),1,{
|
||||
@@ -374,7 +373,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||
|
||||
for(int i=0;i<nbasis;i++) {
|
||||
auto basis_ = Basis[i].View();
|
||||
/* auto basis_ = Basis[i].View( );*/
|
||||
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
||||
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
||||
}
|
||||
@@ -395,8 +394,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
for(int i=0;i<nbasis;i++) {
|
||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||
Lattice<CComplex> cip(coarse);
|
||||
auto cip_ = cip.View();
|
||||
auto ip_ = ip.View();
|
||||
auto cip_ = cip.View(AcceleratorWrite);
|
||||
auto ip_ = ip.View(AcceleratorRead);
|
||||
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
||||
coalescedWrite(cip_[sc], ip_(sc)());
|
||||
});
|
||||
@@ -470,8 +469,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Coordinate rdt = Tg->_rdimensions;
|
||||
Coordinate ist = Tg->_istride;
|
||||
Coordinate ost = Tg->_ostride;
|
||||
auto t_v = To.View();
|
||||
auto f_v = From.View();
|
||||
auto t_v = To.View(AcceleratorWrite);
|
||||
auto f_v = From.View(AcceleratorRead);
|
||||
accelerator_for(idx,Fg->lSites(),1,{
|
||||
sobj s;
|
||||
Coordinate Fcoor(nd);
|
||||
@@ -718,7 +717,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
||||
}
|
||||
|
||||
//loop over outer index
|
||||
auto in_v = in.View();
|
||||
auto in_v = in.View(CpuRead);
|
||||
thread_for(in_oidx,in_grid->oSites(),{
|
||||
//Assemble vector of pointers to output elements
|
||||
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||
@@ -811,7 +810,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
||||
icoor[lane].resize(ndim);
|
||||
grid->iCoorFromIindex(icoor[lane],lane);
|
||||
}
|
||||
auto out_v = out.View();
|
||||
auto out_v = out.View(CpuWrite);
|
||||
thread_for(oidx, grid->oSites(),{
|
||||
//Assemble vector of pointers to output elements
|
||||
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||
@@ -914,7 +913,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||
|
||||
auto out_v = out.View();
|
||||
auto out_v = out.View(CpuWrite);
|
||||
thread_for(out_oidx,out_grid->oSites(),{
|
||||
Coordinate out_ocoor(ndim);
|
||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||
|
Reference in New Issue
Block a user