diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h index fc464ea1..1da968bd 100644 --- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h @@ -85,7 +85,7 @@ public: } std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <oSites()); + StencilMasked.resize(_CoarseGridMulti->oSites()*geom.npoint); std::vector StencilTmp; int32_t j=0; @@ -102,16 +102,19 @@ public: if( ghost_zone==0) { for(int32_t point = 0 ; point < geom.npoint; point++){ int i=s*geom.npoint+point; - tmp._offset = Stencil._entries[i]._offset; - tmp._permute= Stencil._entries[i]._permute; - tmp._output = j; + tmp._offset = Stencil._entries[i]._offset; + tmp._permute= Stencil._entries[i]._permute; // Should be no premute and j=site + tmp._input = s; StencilTmp.push_back(tmp); } j++; } } - std::cout << "coarse osites x npoint "<<_CoarseGridMulti->oSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<lSites()*geom.npoint==StencilTmp.size()); + + std::cout << " oSites " << _CoarseGridMulti->oSites()<oSites()*geom.npoint==StencilTmp.size()); acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size()); CopyMatrix(); } @@ -151,25 +154,21 @@ public: typedef LatticeView Vview; const int Nsimd = CComplex::Nsimd(); - - int64_t osites=pin.Grid()->oSites(); + + RealD flops,bytes; int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd; assert(nrhs>=1); - RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); - RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] - + 2.0*osites*sizeof(siteVector)*npoint; - +#if 0 { tviews-=usecond(); autoView( in_v , pin, AcceleratorRead); autoView( out_v , pout, AcceleratorWriteDiscard); - autoView( Stencil_v , Stencil, AcceleratorRead); tviews+=usecond(); // Static and prereserve to keep UVM region live and not resized across multiple calls ttemps-=usecond(); - MultTemporaries.resize(npoint,pin.Grid()); + MultTemporaries.resize(npoint,in.Grid()); ttemps+=usecond(); std::vector AcceleratorViewContainer_h; @@ -194,10 +193,16 @@ public: tcopy+=usecond(); int32_t bound = _A[0].size(); + int64_t osites=pin.Grid()->oSites(); + flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + + 2.0*osites*sizeof(siteVector)*npoint; + std::cout << " osites "<LocalDimensions()<LocalDimensions()<_permute == 0 ) { + if ( SE->_permute == 0 ) { int32_t snbr= SE->_offset; auto nbr = coalescedReadGeneralPermute(in_v[snbr],SE->_permute,Nd); auto res = Aview_p[point][ss](0,b)*nbr(0); @@ -228,6 +233,7 @@ public: }); tmult2+=usecond(); tmult+=usecond(); + for(int p=0;p AcceleratorViewContainer_h; + std::vector AcceleratorVecViewContainer_h; + + tviews-=usecond(); + for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); + static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); + + auto Aview_p = &AcceleratorViewContainer[0]; + auto Vview_p = &AcceleratorVecViewContainer[0]; + + tcopy-=usecond(); + acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); + acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); + tcopy+=usecond(); + + int32_t bound = _A[0].size(); + int64_t osites=in.Grid()->oSites(); + flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + + 2.0*osites*sizeof(siteVector)*npoint; + + std::cout << " osites "<LocalDimensions()<LocalDimensions()<_input; + int32_t snbr= SE->_offset; + std::cout << " unpadded " << ss<<" padded " << s<< " point "<