mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Massively sped up coarse grid mult, comms
Save 3 ms spent (60% of the time!) on cudaMalloc!
This commit is contained in:
parent
5fac47a26d
commit
4341d96bde
@ -63,6 +63,7 @@ public:
|
|||||||
|
|
||||||
std::vector<CoarseMatrix> _A;
|
std::vector<CoarseMatrix> _A;
|
||||||
std::vector<CoarseMatrix> _Adag;
|
std::vector<CoarseMatrix> _Adag;
|
||||||
|
std::vector<CoarseVector> MultTemporaries;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -125,11 +126,8 @@ public:
|
|||||||
}
|
}
|
||||||
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
|
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
RealD tviews=0;
|
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
|
||||||
RealD ttot=0;
|
|
||||||
RealD tmult=0;
|
|
||||||
RealD texch=0;
|
|
||||||
RealD text=0;
|
|
||||||
ttot=-usecond();
|
ttot=-usecond();
|
||||||
conformable(CoarseGrid(),in.Grid());
|
conformable(CoarseGrid(),in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
@ -155,24 +153,35 @@ public:
|
|||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
{
|
{
|
||||||
|
tviews-=usecond();
|
||||||
autoView( in_v , pin, AcceleratorRead);
|
autoView( in_v , pin, AcceleratorRead);
|
||||||
autoView( out_v , pout, AcceleratorWriteDiscard);
|
autoView( out_v , pout, AcceleratorWriteDiscard);
|
||||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
tviews+=usecond();
|
||||||
|
|
||||||
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
||||||
Vector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.reserve(npoint);
|
ttemps-=usecond();
|
||||||
Vector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.reserve(npoint);
|
MultTemporaries.resize(npoint,pin.Grid());
|
||||||
std::vector<CoarseVector> outp(npoint,pin.Grid());
|
ttemps+=usecond();
|
||||||
|
std::vector<Aview> AcceleratorViewContainer_h;
|
||||||
|
std::vector<Vview> AcceleratorVecViewContainer_h;
|
||||||
|
|
||||||
tviews-=usecond();
|
tviews-=usecond();
|
||||||
for(int p=0;p<npoint;p++) {
|
for(int p=0;p<npoint;p++) {
|
||||||
AcceleratorViewContainer.push_back( A[p].View(AcceleratorRead));
|
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
|
||||||
AcceleratorVecViewContainer.push_back(outp[p].View(AcceleratorWrite));
|
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
||||||
}
|
}
|
||||||
tviews+=usecond();
|
tviews+=usecond();
|
||||||
|
|
||||||
|
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
||||||
|
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
||||||
|
|
||||||
auto Aview_p = &AcceleratorViewContainer[0];
|
auto Aview_p = &AcceleratorViewContainer[0];
|
||||||
auto Vview_p = &AcceleratorVecViewContainer[0];
|
auto Vview_p = &AcceleratorVecViewContainer[0];
|
||||||
|
tcopy-=usecond();
|
||||||
|
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
||||||
|
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
tmult-=usecond();
|
tmult-=usecond();
|
||||||
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
|
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
|
||||||
@ -200,8 +209,8 @@ public:
|
|||||||
});
|
});
|
||||||
tmult+=usecond();
|
tmult+=usecond();
|
||||||
for(int p=0;p<npoint;p++) {
|
for(int p=0;p<npoint;p++) {
|
||||||
AcceleratorViewContainer[p].ViewClose();
|
AcceleratorViewContainer_h[p].ViewClose();
|
||||||
AcceleratorVecViewContainer[p].ViewClose();
|
AcceleratorVecViewContainer_h[p].ViewClose();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -214,12 +223,14 @@ public:
|
|||||||
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<std::endl;
|
// std::cout << GridLogPerformance<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
// std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
// std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
// std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
// std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user