diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h index 2b093923..08d2cde4 100644 --- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h @@ -63,6 +63,7 @@ public: std::vector _A; std::vector _Adag; + std::vector MultTemporaries; /////////////////////// // Interface @@ -125,11 +126,8 @@ public: } void Mult (std::vector &A,const CoarseVector &in, CoarseVector &out) { - RealD tviews=0; - RealD ttot=0; - RealD tmult=0; - RealD texch=0; - RealD text=0; + RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0; + ttot=-usecond(); conformable(CoarseGrid(),in.Grid()); conformable(in.Grid(),out.Grid()); @@ -155,25 +153,36 @@ public: + 2.0*osites*sizeof(siteVector)*npoint; { + tviews-=usecond(); autoView( in_v , pin, AcceleratorRead); autoView( out_v , pout, AcceleratorWriteDiscard); autoView( Stencil_v , Stencil, AcceleratorRead); + tviews+=usecond(); // Static and prereserve to keep UVM region live and not resized across multiple calls - Vector AcceleratorViewContainer; AcceleratorViewContainer.reserve(npoint); - Vector AcceleratorVecViewContainer; AcceleratorVecViewContainer.reserve(npoint); - std::vector outp(npoint,pin.Grid()); + ttemps-=usecond(); + MultTemporaries.resize(npoint,pin.Grid()); + ttemps+=usecond(); + std::vector AcceleratorViewContainer_h; + std::vector AcceleratorVecViewContainer_h; tviews-=usecond(); for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); + static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); + auto Aview_p = &AcceleratorViewContainer[0]; auto Vview_p = &AcceleratorVecViewContainer[0]; - + tcopy-=usecond(); + acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); + acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); + tcopy+=usecond(); + tmult-=usecond(); accelerator_for(spb, osites*nbasis*npoint, Nsimd, { typedef decltype(coalescedRead(in_v[0](0))) calcComplex; @@ -200,8 +209,8 @@ public: }); tmult+=usecond(); for(int p=0;p