From 8bcbd826802e5f0b640dd222ea65b45d83419eee Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Dec 2023 15:21:24 -0500 Subject: [PATCH] BLAS based layout and implementation --- .../GeneralCoarsenedMatrixMultiRHS.h | 396 ++++++++---------- 1 file changed, 178 insertions(+), 218 deletions(-) diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h index e1151707..202a1452 100644 --- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h @@ -27,8 +27,26 @@ Author: Peter Boyle /* END LEGAL */ #pragma once +#include + NAMESPACE_BEGIN(Grid); + +// Move this to accelerator.h +// Also give a copy device. +// Rename acceleratorPut +// Rename acceleratorGet +template void deviceSet(T& dev,T&host) +{ + acceleratorCopyToDevice(&host,&dev,sizeof(T)); +} +template T deviceGet(T& dev) +{ + T host; + acceleratorCopyFromDevice(&dev,&host,sizeof(T)); + return host; +} + // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template @@ -40,6 +58,7 @@ public: typedef iVector siteVector; typedef iMatrix siteMatrix; + typedef iVector calcVector; typedef iMatrix calcMatrix; typedef Lattice > CoarseComplexField; typedef Lattice CoarseVector; @@ -59,10 +78,14 @@ public: NonLocalStencilGeometry geom; PaddedCell Cell; GeneralLocalStencil Stencil; - - std::vector > _A; - std::vector MultTemporaries; - deviceVector StencilMasked; + + deviceVector BLAS_B; + deviceVector BLAS_C; + std::vector > BLAS_A; + + std::vector > BLAS_AP; + std::vector > BLAS_BP; + deviceVector BLAS_CP; /////////////////////// // Interface @@ -76,58 +99,117 @@ public: _CoarseGridMulti(CoarseGridMulti), geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1), Cell(Op.geom.Depth(),_CoarseGridMulti), - Stencil(Cell.grids.back(),geom.shifts) + Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil { - _A.resize(geom.npoint); - int32_t padded_sites = _Op._A[0].Grid()->lSites(); + int32_t padded_sites = _Op._A[0].Grid()->lSites(); + int32_t unpadded_sites = _CoarseGrid->lSites(); + + int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS + int32_t orhs = nrhs/CComplex::Nsimd(); + + ///////////////////////////////////////////////// + // Device data vector storage + ///////////////////////////////////////////////// + BLAS_A.resize(geom.npoint); for(int p=0;plSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <oSites()*geom.npoint); - std::vector StencilTmp; + BLAS_AP.resize(geom.npoint); + BLAS_BP.resize(geom.npoint); + for(int p=0;p lSite + // std::cout << " B ptr "<< nbr<<"/"<oSites()<oSites()*geom.npoint==StencilTmp.size()); - acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size()); + assert(j==unpadded_sites); CopyMatrix(); } + template void GridtoBLAS(const Lattice &grid,deviceVector &out) + { + std::vector tmp; + unvectorizeToLexOrdArray(tmp,grid); + // std::cout << "GridtoBLAS volume " <lSites()<<" "<lSites()); + assert(tmp.size()==out.size()); + out.resize(tmp.size()); + acceleratorCopyToDevice(&tmp[0],&out[0],sizeof(typename vobj::scalar_object)*tmp.size()); + } + template void BLAStoGrid(Lattice &grid,deviceVector &in) + { + std::vector tmp; + tmp.resize(in.size()); + // std::cout << "BLAStoGrid volume " <lSites()<lSites()); + acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size()); + vectorizeFromLexOrdArray(tmp,grid); + } void CopyMatrix (void) { // Clone "A" to be lexicographic in the physics coords // Use unvectorisetolexordarray // Copy to device - std::vector tmp; for(int p=0;pGlobalDimensions()[0]/Nsimd; + int64_t osites=in.Grid()->oSites(); // unpadded + int64_t unpadded_vol = _CoarseGrid->lSites(); + + flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + + 2.0*osites*sizeof(siteVector)*npoint; + + int64_t nrhs =pin.Grid()->GlobalDimensions()[0]; assert(nrhs>=1); -#if 0 - { - tviews-=usecond(); - autoView( in_v , pin, AcceleratorRead); - autoView( out_v , pout, AcceleratorWriteDiscard); - tviews+=usecond(); + std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<lSites()<<" "<lSites()< AcceleratorViewContainer_h; - std::vector AcceleratorVecViewContainer_h; - - tviews-=usecond(); - for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); - static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); - - auto Aview_p = &AcceleratorViewContainer[0]; - auto Vview_p = &AcceleratorVecViewContainer[0]; - - tcopy-=usecond(); - acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); - acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); - tcopy+=usecond(); - - int32_t bound = _A[0].size(); - int64_t osites=pin.Grid()->oSites(); - flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); - bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] - + 2.0*osites*sizeof(siteVector)*npoint; - - // std::cout << " osites "<LocalDimensions()<LocalDimensions()<_permute == 0 ) { - int32_t snbr= SE->_offset; - auto nbr = coalescedReadGeneralPermute(in_v[snbr],SE->_permute,Nd); - auto res = Aview_p[point][ss](0,b)*nbr(0); - for(int bb=1;bb AcceleratorViewContainer_h; - std::vector AcceleratorVecViewContainer_h; - - tviews-=usecond(); - for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); - static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); - - auto Aview_p = &AcceleratorViewContainer[0]; - auto Vview_p = &AcceleratorVecViewContainer[0]; - - tcopy-=usecond(); - acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); - acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); - tcopy+=usecond(); - - int32_t bound = _A[0].size(); - int64_t osites=in.Grid()->oSites(); - flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); - bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] - + 2.0*osites*sizeof(siteVector)*npoint; - - // std::cout << " osites "<LocalDimensions()<LocalDimensions()<_input; // site of padded - int32_t snbr= SE->_offset; - // std::cout << " unpadded " << ss<<" padded " << s<< " point "<