mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-12 07:00:45 +01:00
363 lines
13 KiB
C++
363 lines
13 KiB
C++
/*************************************************************************************
|
|
|
|
Grid physics library, www.github.com/paboyle/Grid
|
|
|
|
Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
|
|
|
|
Copyright (C) 2015
|
|
|
|
Author: Peter Boyle <pboyle@bnl.gov>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
See the full license in the file "LICENSE" in the top level distribution directory
|
|
*************************************************************************************/
|
|
/* END LEGAL */
|
|
#pragma once
|
|
|
|
|
|
NAMESPACE_BEGIN(Grid);
|
|
|
|
|
|
// Fine Object == (per site) type of fine field
|
|
// nbasis == number of deflation vectors
|
|
template<class Fobj,class CComplex,int nbasis>
|
|
class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
|
public:
|
|
typedef typename CComplex::scalar_object SComplex;
|
|
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
|
|
typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
|
|
|
|
typedef iVector<CComplex,nbasis > siteVector;
|
|
typedef iMatrix<CComplex,nbasis > siteMatrix;
|
|
typedef iVector<SComplex,nbasis > calcVector;
|
|
typedef iMatrix<SComplex,nbasis > calcMatrix;
|
|
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
|
|
typedef Lattice<siteVector> CoarseVector;
|
|
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
|
typedef iMatrix<CComplex,nbasis > Cobj;
|
|
typedef iVector<CComplex,nbasis > Cvec;
|
|
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
|
typedef Lattice<Fobj > FineField;
|
|
typedef CoarseVector Field;
|
|
|
|
////////////////////
|
|
// Data members
|
|
////////////////////
|
|
GridCartesian * _CoarseGridMulti;
|
|
GridCartesian * _CoarseGrid;
|
|
GeneralCoarseOp & _Op;
|
|
NonLocalStencilGeometry geom;
|
|
PaddedCell Cell;
|
|
GeneralLocalStencil Stencil;
|
|
|
|
deviceVector<calcVector> BLAS_B;
|
|
deviceVector<calcVector> BLAS_C;
|
|
std::vector<deviceVector<calcMatrix> > BLAS_A;
|
|
|
|
std::vector<deviceVector<ComplexD *> > BLAS_AP;
|
|
std::vector<deviceVector<ComplexD *> > BLAS_BP;
|
|
deviceVector<ComplexD *> BLAS_CP;
|
|
|
|
///////////////////////
|
|
// Interface
|
|
///////////////////////
|
|
GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
|
|
GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
|
|
|
|
MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) :
|
|
_Op(Op),
|
|
_CoarseGrid(Op.CoarseGrid()),
|
|
_CoarseGridMulti(CoarseGridMulti),
|
|
geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
|
|
Cell(Op.geom.Depth(),_CoarseGridMulti),
|
|
Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
|
|
{
|
|
int32_t padded_sites = _Op._A[0].Grid()->lSites();
|
|
int32_t unpadded_sites = _CoarseGrid->lSites();
|
|
|
|
int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
|
|
int32_t orhs = nrhs/CComplex::Nsimd();
|
|
|
|
/////////////////////////////////////////////////
|
|
// Device data vector storage
|
|
/////////////////////////////////////////////////
|
|
BLAS_A.resize(geom.npoint);
|
|
for(int p=0;p<geom.npoint;p++){
|
|
BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
|
|
}
|
|
BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
|
|
BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
|
|
|
|
BLAS_AP.resize(geom.npoint);
|
|
BLAS_BP.resize(geom.npoint);
|
|
for(int p=0;p<geom.npoint;p++){
|
|
BLAS_AP[p].resize(unpadded_sites);
|
|
BLAS_BP[p].resize(unpadded_sites);
|
|
}
|
|
BLAS_CP.resize(unpadded_sites);
|
|
|
|
/////////////////////////////////////////////////
|
|
// Pointers to data
|
|
/////////////////////////////////////////////////
|
|
|
|
// Site identity mapping for A, C
|
|
for(int p=0;p<geom.npoint;p++){
|
|
for(int ss=0;ss<unpadded_sites;ss++){
|
|
ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
|
|
acceleratorPut(BLAS_AP[p][ss],ptr);
|
|
}
|
|
}
|
|
for(int ss=0;ss<unpadded_sites;ss++){
|
|
ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
|
|
acceleratorPut(BLAS_CP[ss],ptr);
|
|
}
|
|
|
|
/////////////////////////////////////////////////
|
|
// Neighbour table is more complicated
|
|
/////////////////////////////////////////////////
|
|
int32_t j=0; // Interior point counter (unpadded)
|
|
for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
|
|
int ghost_zone=0;
|
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
|
int i=s*orhs*geom.npoint+point;
|
|
if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
|
|
ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
|
|
}
|
|
}
|
|
|
|
if( ghost_zone==0) {
|
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
|
int i=s*orhs*geom.npoint+point;
|
|
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
|
|
assert(nbr<BLAS_B.size());
|
|
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
|
|
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
assert(j==unpadded_sites);
|
|
CopyMatrix();
|
|
}
|
|
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
|
|
{
|
|
#if 0
|
|
std::vector<typename vobj::scalar_object> tmp;
|
|
unvectorizeToLexOrdArray(tmp,from);
|
|
assert(tmp.size()==from.Grid()->lSites());
|
|
assert(tmp.size()==to.size());
|
|
to.resize(tmp.size());
|
|
acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
|
|
#else
|
|
typedef typename vobj::scalar_object sobj;
|
|
typedef typename vobj::scalar_type scalar_type;
|
|
typedef typename vobj::vector_type vector_type;
|
|
|
|
GridBase *Fg = from.Grid();
|
|
assert(!Fg->_isCheckerBoarded);
|
|
int nd = Fg->_ndimension;
|
|
|
|
to.resize(Fg->lSites());
|
|
|
|
Coordinate LocalLatt = Fg->LocalDimensions();
|
|
size_t nsite = 1;
|
|
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// do the index calc on the GPU
|
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
Coordinate f_ostride = Fg->_ostride;
|
|
Coordinate f_istride = Fg->_istride;
|
|
Coordinate f_rdimensions = Fg->_rdimensions;
|
|
|
|
autoView(from_v,from,AcceleratorRead);
|
|
auto to_v = &to[0];
|
|
|
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
|
accelerator_for(idx,nsite,1,{
|
|
|
|
Coordinate from_coor, base;
|
|
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
|
|
for(int i=0;i<nd;i++){
|
|
from_coor[i] = base[i];
|
|
}
|
|
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
|
|
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
|
|
|
|
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
|
scalar_type* to = (scalar_type *)&to_v[idx];
|
|
|
|
scalar_type stmp;
|
|
for(int w=0;w<words;w++){
|
|
stmp = getlane(from[w], from_lane);
|
|
to[w] = stmp;
|
|
}
|
|
});
|
|
#endif
|
|
}
|
|
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
|
|
{
|
|
#if 0
|
|
std::vector<typename vobj::scalar_object> tmp;
|
|
tmp.resize(in.size());
|
|
assert(in.size()==grid.Grid()->lSites());
|
|
acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
|
|
vectorizeFromLexOrdArray(tmp,grid);
|
|
#else
|
|
typedef typename vobj::scalar_object sobj;
|
|
typedef typename vobj::scalar_type scalar_type;
|
|
typedef typename vobj::vector_type vector_type;
|
|
|
|
GridBase *Tg = grid.Grid();
|
|
assert(!Tg->_isCheckerBoarded);
|
|
int nd = Tg->_ndimension;
|
|
|
|
assert(in.size()==Tg->lSites());
|
|
|
|
Coordinate LocalLatt = Tg->LocalDimensions();
|
|
size_t nsite = 1;
|
|
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// do the index calc on the GPU
|
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
Coordinate t_ostride = Tg->_ostride;
|
|
Coordinate t_istride = Tg->_istride;
|
|
Coordinate t_rdimensions = Tg->_rdimensions;
|
|
|
|
autoView(to_v,grid,AcceleratorWrite);
|
|
auto from_v = &in[0];
|
|
|
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
|
accelerator_for(idx,nsite,1,{
|
|
|
|
Coordinate to_coor, base;
|
|
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
|
|
for(int i=0;i<nd;i++){
|
|
to_coor[i] = base[i];
|
|
}
|
|
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
|
|
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
|
|
|
|
vector_type* to = (vector_type *)&to_v[to_oidx];
|
|
scalar_type* from = (scalar_type *)&from_v[idx];
|
|
|
|
scalar_type stmp;
|
|
for(int w=0;w<words;w++){
|
|
stmp=from[w];
|
|
putlane(to[w], stmp, to_lane);
|
|
}
|
|
});
|
|
#endif
|
|
}
|
|
void CopyMatrix (void)
|
|
{
|
|
for(int p=0;p<geom.npoint;p++){
|
|
//Unpadded
|
|
auto Aup = _Op.Cell.Extract(_Op._A[p]);
|
|
GridtoBLAS(Aup,BLAS_A[p]);
|
|
}
|
|
}
|
|
void Mdag(const CoarseVector &in, CoarseVector &out)
|
|
{
|
|
this->M(in,out);
|
|
}
|
|
void M (const CoarseVector &in, CoarseVector &out)
|
|
{
|
|
// std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
|
|
conformable(CoarseGrid(),in.Grid());
|
|
conformable(in.Grid(),out.Grid());
|
|
out.Checkerboard() = in.Checkerboard();
|
|
|
|
RealD t_tot;
|
|
RealD t_exch;
|
|
RealD t_GtoB;
|
|
RealD t_BtoG;
|
|
RealD t_mult;
|
|
|
|
t_tot=-usecond();
|
|
CoarseVector tin=in;
|
|
t_exch=-usecond();
|
|
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
|
|
t_exch+=usecond();
|
|
|
|
CoarseVector pout(pin.Grid());
|
|
|
|
int npoint = geom.npoint;
|
|
typedef calcMatrix* Aview;
|
|
typedef LatticeView<Cvec> Vview;
|
|
|
|
const int Nsimd = CComplex::Nsimd();
|
|
|
|
RealD flops,bytes;
|
|
int64_t osites=in.Grid()->oSites(); // unpadded
|
|
int64_t unpadded_vol = _CoarseGrid->lSites();
|
|
|
|
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
|
|
|
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
|
|
assert(nrhs>=1);
|
|
|
|
t_GtoB=-usecond();
|
|
GridtoBLAS(pin,BLAS_B);
|
|
t_GtoB+=usecond();
|
|
|
|
GridBLAS BLAS;
|
|
|
|
t_mult=-usecond();
|
|
for(int p=0;p<geom.npoint;p++){
|
|
RealD c = 1.0;
|
|
if (p==0) c = 0.0;
|
|
ComplexD beta(c);
|
|
|
|
BLAS.gemmBatched(nbasis,nrhs,nbasis,
|
|
ComplexD(1.0),
|
|
BLAS_AP[p],
|
|
BLAS_BP[p],
|
|
ComplexD(c),
|
|
BLAS_CP);
|
|
}
|
|
BLAS.synchronise();
|
|
t_mult+=usecond();
|
|
|
|
t_BtoG=-usecond();
|
|
BLAStoGrid(out,BLAS_C);
|
|
t_BtoG+=usecond();
|
|
t_tot+=usecond();
|
|
|
|
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
|
|
std::cout << GridLogMessage<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
|
|
std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
|
|
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
|
};
|
|
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
|
|
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
|
|
|
|
};
|
|
|
|
NAMESPACE_END(Grid);
|