1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Faster blockProject blockPromote

This commit is contained in:
Peter Boyle 2023-10-24 10:49:55 -04:00
parent 24b6ee0df9
commit aa5047a9e4

View File

@ -276,18 +276,33 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite); autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
RealD t_za=0;
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine> blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]); convertType(coarseData_[sc](v),ip_[sc]);
}); });
t_co+=usecond();
// improve numerical stability of projection // improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis> // |fine> = |fine> - <basis|fine> |basis>
ip=-ip; ip=-ip;
t_za-=usecond();
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
t_za+=usecond();
} }
// std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
} }
// This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level
template<class vobj,class CComplex,int nbasis,class VLattice> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData, const std::vector<Lattice<vobj>> &fineData,
@ -393,8 +408,15 @@ template<class vobj,class CComplex>
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion // Precision promotion
RealD t;
t=-usecond();
fine_inner = localInnerProductD<vobj>(fineX,fineY); fine_inner = localInnerProductD<vobj>(fineX,fineY);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
t=-usecond();
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
t=-usecond();
{ {
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite); autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead); autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@ -402,6 +424,7 @@ template<class vobj,class CComplex>
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
}); });
} }
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
} }
@ -444,6 +467,9 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
template<class vobj> template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{ {
const int maxsubsec=256;
typedef iVector<vobj,maxsubsec> vSubsec;
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
@ -471,16 +497,32 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero(); vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{ // Somewhat lazy calculation
// Find the biggest power of two subsection divisor less than or equal to maxsubsec
int subsec=maxsubsec;
int subvol;
subvol=blockVol/subsec;
while(subvol*subsec!=blockVol){
subsec = subsec/2;
subvol=blockVol/subsec;
};
Lattice<vSubsec> coarseTmp(coarse);
autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
auto coarseTmp_p= &coarseTmp_[0];
// Sum within subsecs in a first kernel
accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
int sc=sce/subsec;
int e=sce%subsec;
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
auto cd = coalescedRead(zz); auto cd = coalescedRead(zz);
for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
for(int sb=0;sb<blockVol;sb++){
int sf; int sf;
Coordinate coor_b(_ndimension); Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
@ -491,9 +533,18 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
cd=cd+coalescedRead(fineData_p[sf]); cd=cd+coalescedRead(fineData_p[sf]);
} }
coalescedWrite(coarseData_p[sc],cd); coalescedWrite(coarseTmp_[sc](e),cd);
}); });
// Sum across subsecs in a second kernel
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
auto cd = coalescedRead(coarseTmp_p[sc](0));
for(int e=1;e<subsec;e++){
cd=cd+coalescedRead(coarseTmp_p[sc](e));
}
coalescedWrite(coarseData_p[sc],cd);
});
return; return;
} }
@ -550,7 +601,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
blockOrthonormalize(ip,Basis); blockOrthonormalize(ip,Basis);
} }
#if 0 #ifdef GRID_ACCELERATED
// TODO: CPU optimized version here // TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@ -576,26 +627,37 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( fineData_ , fineData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorWrite);
autoView( coarseData_ , coarseData, AcceleratorRead); autoView( coarseData_ , coarseData, AcceleratorRead);
typedef LatticeView<vobj> Vview;
std::vector<Vview> AcceleratorVecViewContainer_h;
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
}
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
auto Basis_p = &AcceleratorVecViewContainer[0];
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{ Coordinate frdimensions=fine->_rdimensions;
Coordinate crdimensions=coarse->_rdimensions;
accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
int sc; int sc;
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
for(int i=0;i<nbasis;i++) { auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
/* auto basis_ = Basis[i], );*/ for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]); coalescedWrite(fineData_[sf],sum);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
}); });
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h[v].ViewClose();
}
return; return;
} }
#else #else
// CPU version
template<class vobj,class CComplex,int nbasis,class VLattice> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,