mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Compare commits
No commits in common. "ee3b3c4c56cc8f1a5cfa6e1f9a9745e15a339719" and "25f71913b75fb9ef4030f63bcfba38ae5ccc08f5" have entirely different histories.
ee3b3c4c56
...
25f71913b7
@ -59,7 +59,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice.h>
|
#include <Grid/lattice/Lattice.h>
|
||||||
#include <Grid/cshift/Cshift.h>
|
#include <Grid/cshift/Cshift.h>
|
||||||
#include <Grid/stencil/Stencil.h>
|
#include <Grid/stencil/Stencil.h>
|
||||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
|
||||||
#include <Grid/parallelIO/BinaryIO.h>
|
#include <Grid/parallelIO/BinaryIO.h>
|
||||||
#include <Grid/algorithms/Algorithms.h>
|
#include <Grid/algorithms/Algorithms.h>
|
||||||
NAMESPACE_CHECK(GridCore)
|
NAMESPACE_CHECK(GridCore)
|
||||||
|
@ -29,9 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHMS_H
|
#ifndef GRID_ALGORITHMS_H
|
||||||
#define GRID_ALGORITHMS_H
|
#define GRID_ALGORITHMS_H
|
||||||
|
|
||||||
NAMESPACE_CHECK(blas);
|
|
||||||
#include <Grid/algorithms/blas/BatchedBlas.h>
|
|
||||||
|
|
||||||
NAMESPACE_CHECK(algorithms);
|
NAMESPACE_CHECK(algorithms);
|
||||||
#include <Grid/algorithms/SparseMatrix.h>
|
#include <Grid/algorithms/SparseMatrix.h>
|
||||||
#include <Grid/algorithms/LinearOperator.h>
|
#include <Grid/algorithms/LinearOperator.h>
|
||||||
@ -47,10 +44,7 @@ NAMESPACE_CHECK(SparseMatrix);
|
|||||||
#include <Grid/algorithms/approx/RemezGeneral.h>
|
#include <Grid/algorithms/approx/RemezGeneral.h>
|
||||||
#include <Grid/algorithms/approx/ZMobius.h>
|
#include <Grid/algorithms/approx/ZMobius.h>
|
||||||
NAMESPACE_CHECK(approx);
|
NAMESPACE_CHECK(approx);
|
||||||
#include <Grid/algorithms/deflation/Deflation.h>
|
#include <Grid/algorithms/iterative/Deflation.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
|
||||||
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
|
||||||
NAMESPACE_CHECK(deflation);
|
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
NAMESPACE_CHECK(ConjGrad);
|
NAMESPACE_CHECK(ConjGrad);
|
||||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
||||||
@ -73,10 +67,11 @@ NAMESPACE_CHECK(BiCGSTAB);
|
|||||||
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
|
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
|
||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||||
#include <Grid/algorithms/iterative/AdefGeneric.h>
|
|
||||||
NAMESPACE_CHECK(PowerMethod);
|
NAMESPACE_CHECK(PowerMethod);
|
||||||
#include <Grid/algorithms/multigrid/MultiGrid.h>
|
#include <Grid/algorithms/multigrid/MultiGrid.h>
|
||||||
NAMESPACE_CHECK(multigrid);
|
|
||||||
|
NAMESPACE_CHECK(CoarsendMatrix);
|
||||||
#include <Grid/algorithms/FFT.h>
|
#include <Grid/algorithms/FFT.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,157 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_DEFLATION_H
|
|
||||||
#define GRID_DEFLATION_H
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
class ZeroGuesser: public LinearFunction<Field> {
|
|
||||||
public:
|
|
||||||
using LinearFunction<Field>::operator();
|
|
||||||
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
|
|
||||||
};
|
|
||||||
template<class Field>
|
|
||||||
class DoNothingGuesser: public LinearFunction<Field> {
|
|
||||||
public:
|
|
||||||
using LinearFunction<Field>::operator();
|
|
||||||
virtual void operator()(const Field &src, Field &guess) { };
|
|
||||||
};
|
|
||||||
template<class Field>
|
|
||||||
class SourceGuesser: public LinearFunction<Field> {
|
|
||||||
public:
|
|
||||||
using LinearFunction<Field>::operator();
|
|
||||||
virtual void operator()(const Field &src, Field &guess) { guess = src; };
|
|
||||||
};
|
|
||||||
|
|
||||||
////////////////////////////////
|
|
||||||
// Fine grid deflation
|
|
||||||
////////////////////////////////
|
|
||||||
template<class Field>
|
|
||||||
class DeflatedGuesser: public LinearFunction<Field> {
|
|
||||||
private:
|
|
||||||
const std::vector<Field> &evec;
|
|
||||||
const std::vector<RealD> &eval;
|
|
||||||
const unsigned int N;
|
|
||||||
|
|
||||||
public:
|
|
||||||
using LinearFunction<Field>::operator();
|
|
||||||
|
|
||||||
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
|
|
||||||
: DeflatedGuesser(_evec, _eval, _evec.size())
|
|
||||||
{}
|
|
||||||
|
|
||||||
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
|
|
||||||
: evec(_evec), eval(_eval), N(_N)
|
|
||||||
{
|
|
||||||
assert(evec.size()==eval.size());
|
|
||||||
assert(N <= evec.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void operator()(const Field &src,Field &guess) {
|
|
||||||
guess = Zero();
|
|
||||||
for (int i=0;i<N;i++) {
|
|
||||||
const Field& tmp = evec[i];
|
|
||||||
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
|
|
||||||
}
|
|
||||||
guess.Checkerboard() = src.Checkerboard();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class FineField, class CoarseField>
|
|
||||||
class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
|
|
||||||
private:
|
|
||||||
const std::vector<FineField> &subspace;
|
|
||||||
const std::vector<CoarseField> &evec_coarse;
|
|
||||||
const std::vector<RealD> &eval_coarse;
|
|
||||||
public:
|
|
||||||
|
|
||||||
using LinearFunction<FineField>::operator();
|
|
||||||
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
|
|
||||||
const std::vector<CoarseField> &_evec_coarse,
|
|
||||||
const std::vector<RealD> &_eval_coarse)
|
|
||||||
: subspace(_subspace),
|
|
||||||
evec_coarse(_evec_coarse),
|
|
||||||
eval_coarse(_eval_coarse)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void operator()(const FineField &src,FineField &guess) {
|
|
||||||
int N = (int)evec_coarse.size();
|
|
||||||
CoarseField src_coarse(evec_coarse[0].Grid());
|
|
||||||
CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
|
|
||||||
blockProject(src_coarse,src,subspace);
|
|
||||||
for (int i=0;i<N;i++) {
|
|
||||||
const CoarseField & tmp = evec_coarse[i];
|
|
||||||
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
|
|
||||||
}
|
|
||||||
blockPromote(guess_coarse,guess,subspace);
|
|
||||||
guess.Checkerboard() = src.Checkerboard();
|
|
||||||
};
|
|
||||||
|
|
||||||
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
|
|
||||||
int Nevec = (int)evec_coarse.size();
|
|
||||||
int Nsrc = (int)src.size();
|
|
||||||
// make temp variables
|
|
||||||
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
|
|
||||||
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
|
|
||||||
//Preporcessing
|
|
||||||
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
|
|
||||||
for (int j=0;j<Nsrc;j++)
|
|
||||||
{
|
|
||||||
guess_coarse[j] = Zero();
|
|
||||||
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
|
|
||||||
blockProject(src_coarse[j],src[j],subspace);
|
|
||||||
}
|
|
||||||
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
|
|
||||||
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
|
|
||||||
for (int i=0;i<Nevec;i++)
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
|
|
||||||
const CoarseField & tmp = evec_coarse[i];
|
|
||||||
for (int j=0;j<Nsrc;j++)
|
|
||||||
{
|
|
||||||
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//postprocessing
|
|
||||||
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
|
|
||||||
for (int j=0;j<Nsrc;j++)
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
|
|
||||||
blockPromote(guess_coarse[j],guess[j],subspace);
|
|
||||||
guess[j].Checkerboard() = src[j].Checkerboard();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,512 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: MultiRHSDeflation.h
|
|
||||||
|
|
||||||
Copyright (C) 2023
|
|
||||||
|
|
||||||
Author: Peter Boyle <pboyle@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
MultiRHS block projection
|
|
||||||
|
|
||||||
Import basis -> nblock x nbasis x (block x internal)
|
|
||||||
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
|
|
||||||
|
|
||||||
=> coarse_(nrhs x nbasis )^block = via batched GEMM
|
|
||||||
|
|
||||||
//template<class vobj,class CComplex,int nbasis,class VLattice>
|
|
||||||
//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|
||||||
// const VLattice &fineData,
|
|
||||||
// const VLattice &Basis)
|
|
||||||
*/
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
class MultiRHSBlockProject
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scalar;
|
|
||||||
typedef typename Field::scalar_object scalar_object;
|
|
||||||
typedef Field Fermion;
|
|
||||||
|
|
||||||
int nbasis;
|
|
||||||
GridBase *coarse_grid;
|
|
||||||
GridBase *fine_grid;
|
|
||||||
uint64_t block_vol;
|
|
||||||
uint64_t fine_vol;
|
|
||||||
uint64_t coarse_vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
// Row major layout "C" order:
|
|
||||||
// BLAS_V[coarse_vol][nbasis][block_vol][words]
|
|
||||||
// BLAS_F[coarse_vol][nrhs][block_vol][words]
|
|
||||||
// BLAS_C[coarse_vol][nrhs][nbasis]
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Vxb = [v1(x)][..][vn(x)] ... x coarse vol
|
|
||||||
*
|
|
||||||
* Fxr = [r1(x)][..][rm(x)] ... x coarse vol
|
|
||||||
*
|
|
||||||
* Block project:
|
|
||||||
* C_br = V^dag F x coarse vol
|
|
||||||
*
|
|
||||||
* Block promote:
|
|
||||||
* F_xr = Vxb Cbr x coarse_vol
|
|
||||||
*/
|
|
||||||
deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
|
|
||||||
deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
|
|
||||||
deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
|
|
||||||
|
|
||||||
RealD blasNorm2(deviceVector<scalar> &blas)
|
|
||||||
{
|
|
||||||
scalar ss(0.0);
|
|
||||||
std::vector<scalar> tmp(blas.size());
|
|
||||||
acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
|
|
||||||
for(int64_t s=0;s<blas.size();s++){
|
|
||||||
ss=ss+tmp[s]*adj(tmp[s]);
|
|
||||||
}
|
|
||||||
coarse_grid->GlobalSum(ss);
|
|
||||||
return real(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
MultiRHSBlockProject(){};
|
|
||||||
~MultiRHSBlockProject(){ Deallocate(); };
|
|
||||||
|
|
||||||
void Deallocate(void)
|
|
||||||
{
|
|
||||||
nbasis=0;
|
|
||||||
coarse_grid=nullptr;
|
|
||||||
fine_grid=nullptr;
|
|
||||||
fine_vol=0;
|
|
||||||
block_vol=0;
|
|
||||||
coarse_vol=0;
|
|
||||||
words=0;
|
|
||||||
BLAS_V.resize(0);
|
|
||||||
BLAS_F.resize(0);
|
|
||||||
BLAS_C.resize(0);
|
|
||||||
}
|
|
||||||
void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
|
|
||||||
{
|
|
||||||
nbasis=_nbasis;
|
|
||||||
|
|
||||||
fine_grid=_fgrid;
|
|
||||||
coarse_grid=_cgrid;
|
|
||||||
|
|
||||||
fine_vol = fine_grid->lSites();
|
|
||||||
coarse_vol = coarse_grid->lSites();
|
|
||||||
block_vol = fine_vol/coarse_vol;
|
|
||||||
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
|
|
||||||
BLAS_V.resize (fine_vol * words * nbasis );
|
|
||||||
}
|
|
||||||
void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
|
|
||||||
{
|
|
||||||
int nvec = vecs.size();
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
std::cout << " BlockProjector importing "<<nvec<< " vectors" <<std::endl;
|
|
||||||
|
|
||||||
assert(vecs[0].Grid()==fine_grid);
|
|
||||||
|
|
||||||
subdivides(coarse_grid,fine_grid); // require they map
|
|
||||||
|
|
||||||
int _ndimension = coarse_grid->_ndimension;
|
|
||||||
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
|
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
|
||||||
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t sz = blas.size();
|
|
||||||
|
|
||||||
acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
|
|
||||||
|
|
||||||
Coordinate fine_rdimensions = fine_grid->_rdimensions;
|
|
||||||
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
|
|
||||||
int64_t bv= block_vol;
|
|
||||||
for(int v=0;v<vecs.size();v++){
|
|
||||||
|
|
||||||
// std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
|
|
||||||
autoView( fineData , vecs[v], AcceleratorRead);
|
|
||||||
|
|
||||||
auto blasData_p = &blas[0];
|
|
||||||
auto fineData_p = &fineData[0];
|
|
||||||
|
|
||||||
int64_t osites = fine_grid->oSites();
|
|
||||||
|
|
||||||
// loop over fine sites
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
|
||||||
// std::cout << "sz "<<sz<<std::endl;
|
|
||||||
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
|
|
||||||
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
|
|
||||||
uint64_t lwords= words; // local variable for copy in to GPU
|
|
||||||
accelerator_for(sf,osites,Nsimd,{
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
{
|
|
||||||
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
|
|
||||||
#else
|
|
||||||
for(int lane=0;lane<Nsimd;lane++) {
|
|
||||||
#endif
|
|
||||||
// One thread per fine site
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
Coordinate coor_b(_ndimension);
|
|
||||||
Coordinate coor_c(_ndimension);
|
|
||||||
|
|
||||||
// Fine site to fine coor
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
|
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
|
|
||||||
|
|
||||||
int sc;// coarse site
|
|
||||||
int sb;// block site
|
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
|
|
||||||
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
|
|
||||||
|
|
||||||
scalar_object data = extractLane(lane,fineData[sf]);
|
|
||||||
|
|
||||||
// BLAS layout address calculation
|
|
||||||
// words * block_vol * nbasis x coarse_vol
|
|
||||||
// coarse oSite x block vole x lanes
|
|
||||||
int64_t site = (lane*osites + sc*bv)*nvec
|
|
||||||
+ v*bv
|
|
||||||
+ sb;
|
|
||||||
|
|
||||||
// assert(site*lwords<sz);
|
|
||||||
|
|
||||||
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
|
|
||||||
|
|
||||||
*ptr = data;
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
});
|
|
||||||
// std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
|
|
||||||
// std::cout << " BlockProjector imported vector"<<v<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
|
|
||||||
{
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
|
|
||||||
int nvec = vecs.size();
|
|
||||||
|
|
||||||
assert(vecs[0].Grid()==fine_grid);
|
|
||||||
|
|
||||||
subdivides(coarse_grid,fine_grid); // require they map
|
|
||||||
|
|
||||||
int _ndimension = coarse_grid->_ndimension;
|
|
||||||
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
|
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
|
||||||
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
|
|
||||||
}
|
|
||||||
Coordinate fine_rdimensions = fine_grid->_rdimensions;
|
|
||||||
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
|
|
||||||
|
|
||||||
// std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
|
|
||||||
|
|
||||||
int64_t bv= block_vol;
|
|
||||||
for(int v=0;v<vecs.size();v++){
|
|
||||||
|
|
||||||
autoView( fineData , vecs[v], AcceleratorWrite);
|
|
||||||
|
|
||||||
auto blasData_p = &blas[0];
|
|
||||||
auto fineData_p = &fineData[0];
|
|
||||||
|
|
||||||
int64_t osites = fine_grid->oSites();
|
|
||||||
uint64_t lwords = words;
|
|
||||||
// std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
|
|
||||||
// std::cout << " lwords is "<<lwords << std::endl;
|
|
||||||
// std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
|
|
||||||
// loop over fine sites
|
|
||||||
accelerator_for(sf,osites,vobj::Nsimd(),{
|
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
{
|
|
||||||
int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
|
|
||||||
#else
|
|
||||||
for(int lane=0;lane<vobj::Nsimd();lane++) {
|
|
||||||
#endif
|
|
||||||
// One thread per fine site
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
Coordinate coor_b(_ndimension);
|
|
||||||
Coordinate coor_c(_ndimension);
|
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
|
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
|
|
||||||
|
|
||||||
int sc;
|
|
||||||
int sb;
|
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
|
|
||||||
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
|
|
||||||
|
|
||||||
// BLAS layout address calculation
|
|
||||||
// words * block_vol * nbasis x coarse_vol
|
|
||||||
int64_t site = (lane*osites + sc*bv)*nvec
|
|
||||||
+ v*bv
|
|
||||||
+ sb;
|
|
||||||
|
|
||||||
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
|
|
||||||
|
|
||||||
scalar_object data = *ptr;
|
|
||||||
|
|
||||||
insertLane(lane,fineData[sf],data);
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template<class vobj>
|
|
||||||
void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
|
|
||||||
{
|
|
||||||
int nvec = vecs.size();
|
|
||||||
typedef typename vobj::scalar_object coarse_scalar_object;
|
|
||||||
|
|
||||||
std::cout << " BlockProjector importing coarse grid "<<nvec<< " vectors" <<std::endl;
|
|
||||||
|
|
||||||
assert(vecs[0].Grid()==coarse_grid);
|
|
||||||
|
|
||||||
int _ndimension = coarse_grid->_ndimension;
|
|
||||||
|
|
||||||
uint64_t sz = blas.size();
|
|
||||||
|
|
||||||
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
|
|
||||||
|
|
||||||
for(int v=0;v<vecs.size();v++){
|
|
||||||
|
|
||||||
// std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
|
|
||||||
autoView( coarseData , vecs[v], AcceleratorRead);
|
|
||||||
|
|
||||||
auto blasData_p = &blas[0];
|
|
||||||
auto coarseData_p = &coarseData[0];
|
|
||||||
|
|
||||||
int64_t osites = coarse_grid->oSites();
|
|
||||||
|
|
||||||
// loop over fine sites
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
|
||||||
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
|
|
||||||
assert(cwords==nbasis);
|
|
||||||
|
|
||||||
accelerator_for(sc,osites,Nsimd,{
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
{
|
|
||||||
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
|
|
||||||
#else
|
|
||||||
for(int lane=0;lane<Nsimd;lane++) {
|
|
||||||
#endif
|
|
||||||
// C_br per site
|
|
||||||
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
|
|
||||||
|
|
||||||
coarse_scalar_object data = extractLane(lane,coarseData[sc]);
|
|
||||||
|
|
||||||
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
|
|
||||||
|
|
||||||
*ptr = data;
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
});
|
|
||||||
// std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template<class vobj>
|
|
||||||
void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
|
|
||||||
{
|
|
||||||
int nvec = vecs.size();
|
|
||||||
typedef typename vobj::scalar_object coarse_scalar_object;
|
|
||||||
std::cout << " BlockProjector importing coarse grid "<<nvec<< " vectors" <<std::endl;
|
|
||||||
|
|
||||||
assert(vecs[0].Grid()==coarse_grid);
|
|
||||||
|
|
||||||
int _ndimension = coarse_grid->_ndimension;
|
|
||||||
|
|
||||||
uint64_t sz = blas.size();
|
|
||||||
|
|
||||||
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
|
|
||||||
|
|
||||||
// std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
|
|
||||||
for(int v=0;v<vecs.size();v++){
|
|
||||||
|
|
||||||
// std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
|
|
||||||
autoView( coarseData , vecs[v], AcceleratorWrite);
|
|
||||||
|
|
||||||
auto blasData_p = &blas[0];
|
|
||||||
auto coarseData_p = &coarseData[0];
|
|
||||||
|
|
||||||
int64_t osites = coarse_grid->oSites();
|
|
||||||
|
|
||||||
// loop over fine sites
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
|
||||||
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
|
|
||||||
assert(cwords==nbasis);
|
|
||||||
|
|
||||||
accelerator_for(sc,osites,Nsimd,{
|
|
||||||
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
{
|
|
||||||
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
|
|
||||||
#else
|
|
||||||
for(int lane=0;lane<Nsimd;lane++) {
|
|
||||||
#endif
|
|
||||||
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
|
|
||||||
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
|
|
||||||
coarse_scalar_object data = *ptr;
|
|
||||||
insertLane(lane,coarseData[sc],data);
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void ImportBasis(std::vector < Field > &vecs)
|
|
||||||
{
|
|
||||||
// std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
|
|
||||||
ImportFineGridVectors(vecs,BLAS_V);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class cobj>
|
|
||||||
void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
|
|
||||||
{
|
|
||||||
int nrhs=fine.size();
|
|
||||||
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
|
|
||||||
assert(nbasis==_nbasis);
|
|
||||||
|
|
||||||
BLAS_F.resize (fine_vol * words * nrhs );
|
|
||||||
BLAS_C.resize (coarse_vol * nbasis * nrhs );
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources to same data layout
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// std::cout << "BlockProject import fine"<<std::endl;
|
|
||||||
ImportFineGridVectors(fine,BLAS_F);
|
|
||||||
|
|
||||||
deviceVector<scalar *> Vd(coarse_vol);
|
|
||||||
deviceVector<scalar *> Fd(coarse_vol);
|
|
||||||
deviceVector<scalar *> Cd(coarse_vol);
|
|
||||||
|
|
||||||
// std::cout << "BlockProject pointers"<<std::endl;
|
|
||||||
for(int c=0;c<coarse_vol;c++){
|
|
||||||
// BLAS_V[coarse_vol][nbasis][block_vol][words]
|
|
||||||
// BLAS_F[coarse_vol][nrhs][block_vol][words]
|
|
||||||
// BLAS_C[coarse_vol][nrhs][nbasis]
|
|
||||||
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
|
|
||||||
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
|
|
||||||
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
|
|
||||||
|
|
||||||
acceleratorPut(Vd[c],Vh);
|
|
||||||
acceleratorPut(Fd[c],Fh);
|
|
||||||
acceleratorPut(Cd[c],Ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
// std::cout << "BlockProject BLAS"<<std::endl;
|
|
||||||
int64_t vw = block_vol * words;
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// C_br = V^dag R
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
nbasis,nrhs,vw,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Vd,
|
|
||||||
Fd,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Cd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
// std::cout << "BlockProject done"<<std::endl;
|
|
||||||
ExportCoarseGridVectors(coarse, BLAS_C);
|
|
||||||
// std::cout << "BlockProject done"<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class cobj>
|
|
||||||
void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
|
|
||||||
{
|
|
||||||
int nrhs=fine.size();
|
|
||||||
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
|
|
||||||
assert(nbasis==_nbasis);
|
|
||||||
|
|
||||||
BLAS_F.resize (fine_vol * words * nrhs );
|
|
||||||
BLAS_C.resize (coarse_vol * nbasis * nrhs );
|
|
||||||
|
|
||||||
ImportCoarseGridVectors(coarse, BLAS_C);
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
deviceVector<scalar *> Vd(coarse_vol);
|
|
||||||
deviceVector<scalar *> Fd(coarse_vol);
|
|
||||||
deviceVector<scalar *> Cd(coarse_vol);
|
|
||||||
|
|
||||||
for(int c=0;c<coarse_vol;c++){
|
|
||||||
// BLAS_V[coarse_vol][nbasis][block_vol][words]
|
|
||||||
// BLAS_F[coarse_vol][nrhs][block_vol][words]
|
|
||||||
// BLAS_C[coarse_vol][nrhs][nbasis]
|
|
||||||
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
|
|
||||||
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
|
|
||||||
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
|
|
||||||
acceleratorPut(Vd[c],Vh);
|
|
||||||
acceleratorPut(Fd[c],Fh);
|
|
||||||
acceleratorPut(Cd[c],Ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// Block promote:
|
|
||||||
// F_xr = Vxb Cbr (x coarse_vol)
|
|
||||||
/////////////////////////////////////////
|
|
||||||
|
|
||||||
int64_t vw = block_vol * words;
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
vw,nrhs,nbasis,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Vd,
|
|
||||||
Cd,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Fd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
// std::cout << " blas call done"<<std::endl;
|
|
||||||
|
|
||||||
ExportFineGridVectors(fine, BLAS_F);
|
|
||||||
// std::cout << " exported "<<std::endl;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -1,234 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: MultiRHSDeflation.h
|
|
||||||
|
|
||||||
Copyright (C) 2023
|
|
||||||
|
|
||||||
Author: Peter Boyle <pboyle@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
|
|
||||||
/* Need helper object for BLAS accelerated mrhs projection
|
|
||||||
|
|
||||||
i) MultiRHS Deflation
|
|
||||||
|
|
||||||
Import Evecs -> nev x vol x internal
|
|
||||||
Import vector of Lattice objects -> nrhs x vol x internal
|
|
||||||
=> Cij (nrhs x Nev) via GEMM.
|
|
||||||
=> Guess (nrhs x vol x internal) = C x evecs (via GEMM)
|
|
||||||
Export
|
|
||||||
|
|
||||||
|
|
||||||
ii) MultiRHS block projection
|
|
||||||
|
|
||||||
Import basis -> nblock x nbasis x (block x internal)
|
|
||||||
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
|
|
||||||
|
|
||||||
=> coarse_(nrhs x nbasis )^block = via batched GEMM
|
|
||||||
|
|
||||||
iii) Alternate interface:
|
|
||||||
Import higher dim Lattice object-> vol x nrhs layout
|
|
||||||
|
|
||||||
*/
|
|
||||||
template<class Field>
|
|
||||||
class MultiRHSDeflation
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scalar;
|
|
||||||
typedef typename Field::scalar_object scalar_object;
|
|
||||||
|
|
||||||
int nev;
|
|
||||||
std::vector<RealD> eval;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
deviceVector<scalar> BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda))
|
|
||||||
deviceVector<scalar> BLAS_R; // nrhs x vol -- the sources
|
|
||||||
deviceVector<scalar> BLAS_G; // nrhs x vol -- the guess
|
|
||||||
deviceVector<scalar> BLAS_C; // nrhs x nev -- the coefficients
|
|
||||||
|
|
||||||
MultiRHSDeflation(){};
|
|
||||||
~MultiRHSDeflation(){ Deallocate(); };
|
|
||||||
|
|
||||||
void Deallocate(void)
|
|
||||||
{
|
|
||||||
nev=0;
|
|
||||||
grid=nullptr;
|
|
||||||
vol=0;
|
|
||||||
words=0;
|
|
||||||
BLAS_E.resize(0);
|
|
||||||
BLAS_R.resize(0);
|
|
||||||
BLAS_C.resize(0);
|
|
||||||
BLAS_G.resize(0);
|
|
||||||
}
|
|
||||||
void Allocate(int _nev,GridBase *_grid)
|
|
||||||
{
|
|
||||||
nev=_nev;
|
|
||||||
grid=_grid;
|
|
||||||
vol = grid->lSites();
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
eval.resize(nev);
|
|
||||||
BLAS_E.resize (vol * words * nev );
|
|
||||||
std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
|
|
||||||
}
|
|
||||||
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
|
|
||||||
{
|
|
||||||
assert(ev<eval.size());
|
|
||||||
std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
|
|
||||||
eval[ev] = _eval;
|
|
||||||
|
|
||||||
int64_t offset = ev*vol*words;
|
|
||||||
autoView(v,evec,AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&v[0],&BLAS_E[offset],sizeof(scalar_object)*vol);
|
|
||||||
|
|
||||||
}
|
|
||||||
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
|
|
||||||
{
|
|
||||||
ImportEigenBasis(evec,_eval,0,evec.size());
|
|
||||||
}
|
|
||||||
// Could use to import a batch of eigenvectors
|
|
||||||
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
|
|
||||||
{
|
|
||||||
assert(_ev0+_nev<=evec.size());
|
|
||||||
|
|
||||||
Allocate(_nev,evec[0].Grid());
|
|
||||||
|
|
||||||
// Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
|
|
||||||
for(int e=0;e<nev;e++){
|
|
||||||
std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
|
|
||||||
ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
|
|
||||||
{
|
|
||||||
int nrhs = source.size();
|
|
||||||
assert(source.size()==guess.size());
|
|
||||||
assert(grid == guess[0].Grid());
|
|
||||||
conformable(guess[0],source[0]);
|
|
||||||
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors "<<std::endl;
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// for(int r=0;r<nrhs;r++){
|
|
||||||
// std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
|
|
||||||
// }
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(v,source[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Exe = [e1(x)][..][en(x)]
|
|
||||||
*
|
|
||||||
* Rxr = [r1(x)][..][rm(x)]
|
|
||||||
*
|
|
||||||
* C_er = E^dag R
|
|
||||||
* C_er = C_er / lambda_e
|
|
||||||
* G_xr = Exe Cer
|
|
||||||
*/
|
|
||||||
deviceVector<scalar *> Ed(1);
|
|
||||||
deviceVector<scalar *> Rd(1);
|
|
||||||
deviceVector<scalar *> Cd(1);
|
|
||||||
deviceVector<scalar *> Gd(1);
|
|
||||||
|
|
||||||
scalar * Eh = & BLAS_E[0];
|
|
||||||
scalar * Rh = & BLAS_R[0];
|
|
||||||
scalar * Ch = & BLAS_C[0];
|
|
||||||
scalar * Gh = & BLAS_G[0];
|
|
||||||
|
|
||||||
acceleratorPut(Ed[0],Eh);
|
|
||||||
acceleratorPut(Rd[0],Rh);
|
|
||||||
acceleratorPut(Cd[0],Ch);
|
|
||||||
acceleratorPut(Gd[0],Gh);
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// C_er = E^dag R
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
nev,nrhs,vw,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Ed,
|
|
||||||
Rd,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Cd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
|
|
||||||
assert(BLAS_C.size()==nev*nrhs);
|
|
||||||
|
|
||||||
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
|
|
||||||
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
|
|
||||||
for(int e=0;e<nev;e++){
|
|
||||||
RealD lam(1.0/eval[e]);
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int off = e+nev*r;
|
|
||||||
HOST_C[off]=HOST_C[off] * lam;
|
|
||||||
// std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// Guess G_xr = Exe Cer
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
vw,nrhs,nev,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Ed, // x . nev
|
|
||||||
Cd, // nev . nrhs
|
|
||||||
ComplexD(0.0),
|
|
||||||
Gd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
|
|
||||||
///////////////////////////////////////
|
|
||||||
// Copy out the multirhs
|
|
||||||
///////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(v,guess[r],AcceleratorWrite);
|
|
||||||
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
RealD t1 = usecond();
|
|
||||||
std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -41,7 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
*/
|
*/
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
class TwoLevelCG : public LinearFunction<Field>
|
class TwoLevelCG : public LinearFunction<Field>
|
||||||
{
|
{
|
||||||
@ -70,7 +69,7 @@ class TwoLevelCG : public LinearFunction<Field>
|
|||||||
|
|
||||||
virtual void operator() (const Field &src, Field &x)
|
virtual void operator() (const Field &src, Field &x)
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl;
|
std::cout << GridLogMessage<<"HDCG: fPcg starting"<<std::endl;
|
||||||
RealD f;
|
RealD f;
|
||||||
RealD rtzp,rtz,a,d,b;
|
RealD rtzp,rtz,a,d,b;
|
||||||
RealD rptzp;
|
RealD rptzp;
|
||||||
@ -247,7 +246,7 @@ class TwoLevelCG : public LinearFunction<Field>
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Set up history vectors
|
// Set up history vectors
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
int mmax = 3;
|
int mmax = 2;
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
|
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
|
||||||
src[0].Grid()->Barrier();
|
src[0].Grid()->Barrier();
|
||||||
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
|
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
|
||||||
@ -279,9 +278,9 @@ class TwoLevelCG : public LinearFunction<Field>
|
|||||||
//////////////////////////
|
//////////////////////////
|
||||||
// x0 = Vstart -- possibly modify guess
|
// x0 = Vstart -- possibly modify guess
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
Vstart(x,src);
|
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++){
|
for(int rhs=0;rhs<nrhs;rhs++){
|
||||||
|
Vstart(x[rhs],src[rhs]);
|
||||||
|
|
||||||
// r0 = b -A x0
|
// r0 = b -A x0
|
||||||
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
|
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
|
||||||
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
|
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
|
||||||
@ -325,8 +324,6 @@ class TwoLevelCG : public LinearFunction<Field>
|
|||||||
|
|
||||||
// Compute z = M x (for *all* RHS)
|
// Compute z = M x (for *all* RHS)
|
||||||
PcgM1(r,z);
|
PcgM1(r,z);
|
||||||
std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
|
|
||||||
grid->Barrier();
|
|
||||||
|
|
||||||
RealD max_rn=0.0;
|
RealD max_rn=0.0;
|
||||||
for(int rhs=0;rhs<nrhs;rhs++){
|
for(int rhs=0;rhs<nrhs;rhs++){
|
||||||
@ -400,19 +397,12 @@ class TwoLevelCG : public LinearFunction<Field>
|
|||||||
|
|
||||||
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
|
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
|
||||||
{
|
{
|
||||||
std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl;
|
std::cout << "PcgM1 default (cheat) mrhs versoin"<<std::endl;
|
||||||
for(int rhs=0;rhs<in.size();rhs++){
|
for(int rhs=0;rhs<in.size();rhs++){
|
||||||
this->PcgM1(in[rhs],out[rhs]);
|
this->PcgM1(in[rhs],out[rhs]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual void PcgM1(Field & in, Field & out) =0;
|
virtual void PcgM1(Field & in, Field & out) =0;
|
||||||
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
|
|
||||||
{
|
|
||||||
std::cout << "Vstart default (cheat) mrhs version"<<std::endl;
|
|
||||||
for(int rhs=0;rhs<x.size();rhs++){
|
|
||||||
this->Vstart(x[rhs],src[rhs]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
virtual void Vstart(Field & x,const Field & src)=0;
|
virtual void Vstart(Field & x,const Field & src)=0;
|
||||||
|
|
||||||
virtual void PcgM2(const Field & in, Field & out) {
|
virtual void PcgM2(const Field & in, Field & out) {
|
||||||
@ -544,130 +534,76 @@ class TwoLevelADEF2mrhs : public TwoLevelADEF2<Field,CoarseField,Aggregation>
|
|||||||
public:
|
public:
|
||||||
GridBase *coarsegridmrhs;
|
GridBase *coarsegridmrhs;
|
||||||
LinearFunction<CoarseField> &_CoarseSolverMrhs;
|
LinearFunction<CoarseField> &_CoarseSolverMrhs;
|
||||||
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
|
|
||||||
LinearFunction<CoarseField> &_CoarseGuesser;
|
LinearFunction<CoarseField> &_CoarseGuesser;
|
||||||
TwoLevelADEF2mrhs(RealD tol,
|
TwoLevelADEF2mrhs(RealD tol,
|
||||||
Integer maxit,
|
Integer maxit,
|
||||||
LinearOperatorBase<Field> &FineLinop,
|
LinearOperatorBase<Field> &FineLinop,
|
||||||
LinearFunction<Field> &Smoother,
|
LinearFunction<Field> &Smoother,
|
||||||
// LinearFunction<CoarseField> &CoarseSolver,
|
LinearFunction<CoarseField> &CoarseSolver,
|
||||||
// LinearFunction<CoarseField> &CoarseSolverPrecise,
|
LinearFunction<CoarseField> &CoarseSolverPrecise,
|
||||||
LinearFunction<CoarseField> &CoarseSolverMrhs,
|
LinearFunction<CoarseField> &CoarseSolverMrhs,
|
||||||
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
|
|
||||||
LinearFunction<CoarseField> &CoarseGuesser,
|
LinearFunction<CoarseField> &CoarseGuesser,
|
||||||
GridBase *rhsgrid,
|
GridBase *rhsgrid,
|
||||||
Aggregation &Aggregates) :
|
Aggregation &Aggregates) :
|
||||||
TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolverMrhs,CoarseSolverPreciseMrhs,Aggregates),
|
TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates),
|
||||||
_CoarseSolverMrhs(CoarseSolverMrhs),
|
_CoarseSolverMrhs(CoarseSolverMrhs),
|
||||||
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
|
|
||||||
_CoarseGuesser(CoarseGuesser)
|
_CoarseGuesser(CoarseGuesser)
|
||||||
{
|
{
|
||||||
coarsegridmrhs = rhsgrid;
|
coarsegridmrhs = rhsgrid;
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
|
|
||||||
{
|
|
||||||
int nrhs=x.size();
|
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg Vstart for "<<nrhs<<" right hand sides" <<std::endl;
|
|
||||||
///////////////////////////////////
|
|
||||||
// Choose x_0 such that
|
|
||||||
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
|
|
||||||
// = [1 - Ass_inv A] Guess + Assinv src
|
|
||||||
// = P^T guess + Assinv src
|
|
||||||
// = Vstart [Tang notation]
|
|
||||||
// This gives:
|
|
||||||
// W^T (src - A x_0) = src_s - A guess_s - r_s
|
|
||||||
// = src_s - (A guess)_s - src_s + (A guess)_s
|
|
||||||
// = 0
|
|
||||||
///////////////////////////////////
|
|
||||||
CoarseField PleftProj(this->coarsegrid);
|
|
||||||
CoarseField PleftMss_proj(this->coarsegrid);
|
|
||||||
|
|
||||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
|
||||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs projecting "<<std::endl;
|
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
|
||||||
this->_Aggregates.ProjectToSubspace(PleftProj,src[rhs]); // can optimise later
|
|
||||||
InsertSliceFast(PleftProj,PleftProjMrhs,rhs,0);
|
|
||||||
this->_CoarseGuesser(PleftProj,PleftMss_proj);
|
|
||||||
InsertSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs coarse solve "<<std::endl;
|
|
||||||
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
|
||||||
ExtractSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
|
||||||
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x[rhs]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
|
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
|
||||||
|
|
||||||
int nrhs=in.size();
|
int nrhs=in.size();
|
||||||
std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
|
std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
|
||||||
MemoryManager::Print();
|
|
||||||
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
|
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
|
||||||
Field tmp(this->grid);
|
Field tmp(this->grid);
|
||||||
std::vector<Field> Min(nrhs,this->grid);
|
std::vector<Field> Min(nrhs,this->grid);
|
||||||
std::cout << " mrhs PcgM1 Min "<<std::endl;
|
|
||||||
CoarseField PleftProj(this->coarsegrid);
|
CoarseField PleftProj(this->coarsegrid);
|
||||||
CoarseField PleftMss_proj(this->coarsegrid);
|
CoarseField PleftMss_proj(this->coarsegrid);
|
||||||
|
|
||||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||||
std::cout << " mrhs Coarse ops "<<std::endl;
|
|
||||||
|
|
||||||
// Really want the coarse solver
|
|
||||||
// to do the guessing itself, knowing the eigenvectors.
|
|
||||||
// The projection to coarse space is in aggregates
|
|
||||||
// If the Aggregates have a layout change option
|
|
||||||
// they could formulate as a BLAS routine.
|
|
||||||
// Put the routines in this object
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
|
this->grid->Barrier();
|
||||||
std::cout << GridLogMessage<<" Smoother for "<<rhs<<std::endl;
|
std::cout << " Calling smoother for "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
this->_Smoother(in[rhs],Min[rhs]);
|
this->_Smoother(in[rhs],Min[rhs]);
|
||||||
|
this->grid->Barrier();
|
||||||
std::cout << GridLogMessage<<" HermOp for "<<rhs<<std::endl;
|
std::cout << " smoother done "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
||||||
|
this->grid->Barrier();
|
||||||
|
std::cout << " Hermop for "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
axpy(tmp,-1.0,out[rhs],in[rhs]); // tmp = in - A Min
|
axpy(tmp,-1.0,out[rhs],in[rhs]); // tmp = in - A Min
|
||||||
|
this->grid->Barrier();
|
||||||
// Was
|
std::cout << " axpy "<<rhs<<std::endl;
|
||||||
// this->_Aggregates.ProjectToSubspace(PleftProj,tmp); // can optimise later
|
this->grid->Barrier();
|
||||||
// Now:
|
this->_Aggregates.ProjectToSubspace(PleftProj,tmp); // can optimise later
|
||||||
std::cout << GridLogMessage<<" blockProject for "<<rhs<<std::endl;
|
this->grid->Barrier();
|
||||||
blockProjectFast(PleftProj,tmp,this->_Aggregates.subspace);
|
std::cout << " project "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
|
|
||||||
InsertSlice(PleftProj,PleftProjMrhs,rhs,0);
|
InsertSlice(PleftProj,PleftProjMrhs,rhs,0);
|
||||||
|
this->grid->Barrier();
|
||||||
std::cout << GridLogMessage<<" CoarseGuesser for "<<rhs<<std::endl;
|
std::cout << " insert rhs "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
this->_CoarseGuesser(PleftProj,PleftMss_proj);
|
this->_CoarseGuesser(PleftProj,PleftMss_proj);
|
||||||
|
this->grid->Barrier();
|
||||||
std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
|
std::cout << " insert guess "<<rhs<<std::endl;
|
||||||
|
this->grid->Barrier();
|
||||||
InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
||||||
}
|
}
|
||||||
MemoryManager::Print();
|
|
||||||
|
|
||||||
std::cout << " Coarse solve "<<std::endl;
|
std::cout << " Coarse solve "<<std::endl;
|
||||||
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
|
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
|
||||||
std::cout << " Coarse solve done"<<std::endl;
|
|
||||||
MemoryManager::Print();
|
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
std::cout << GridLogMessage<<" Extract for "<<rhs<<std::endl;
|
|
||||||
ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
|
||||||
std::cout << GridLogMessage<<" Promote for "<<rhs<<std::endl;
|
|
||||||
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
|
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
|
||||||
// std::cout << " add for "<<rhs<<std::endl;
|
|
||||||
axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
|
axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
|
||||||
}
|
}
|
||||||
MemoryManager::Print();
|
|
||||||
std::cout << " Extracted "<<std::endl;
|
std::cout << " Extracted "<<std::endl;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -42,8 +42,6 @@ inline RealD AggregatePowerLaw(RealD x)
|
|||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
class Aggregation {
|
class Aggregation {
|
||||||
public:
|
public:
|
||||||
constexpr int Nbasis(void) { return nbasis; };
|
|
||||||
|
|
||||||
typedef iVector<CComplex,nbasis > siteVector;
|
typedef iVector<CComplex,nbasis > siteVector;
|
||||||
typedef Lattice<siteVector> CoarseVector;
|
typedef Lattice<siteVector> CoarseVector;
|
||||||
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
||||||
|
@ -55,12 +55,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
typedef int32_t gridblasHandle_t;
|
typedef int32_t gridblasHandle_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
|
|
||||||
|
|
||||||
class GridBLAS {
|
class GridBLAS {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
static gridblasHandle_t gridblasHandle;
|
static gridblasHandle_t gridblasHandle;
|
||||||
static int gridblasInit;
|
static int gridblasInit;
|
||||||
|
|
||||||
@ -77,7 +74,6 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
#endif
|
#endif
|
||||||
gridblasInit=1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,71 +108,37 @@ public:
|
|||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
|
||||||
void gemmBatched(int m,int n, int k,
|
|
||||||
ComplexD alpha,
|
|
||||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
|
||||||
deviceVector<ComplexD*> &Bkn,
|
|
||||||
ComplexD beta,
|
|
||||||
deviceVector<ComplexD*> &Cmn)
|
|
||||||
{
|
{
|
||||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
|
||||||
m,n,k,
|
int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
|
||||||
|
    int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
    ComplexD alpha(1.0);
    ComplexD beta (1.0);
    for(int i=0;i<10;i++){
      RealD t0 = usecond();
      for(int s=0;s<nstencil;s++){
        gemmStridedBatched(nbasis,nrhs,nbasis,
                           alpha,
                           &A[0], // m x k
                           &B[0], // k x n
                           beta,
                           &C[0], // m x n
                           coarseVol);
      }
      synchronise();
      RealD t1 = usecond();
      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
      std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
      std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    }
  }
                alpha,
                Amk,
                Bkn,
                beta,
                Cmn);
  }
  void gemmBatched(int m,int n, int k,
                   ComplexF alpha,
                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexF*> &Bkn,
                   ComplexF beta,
                   deviceVector<ComplexF*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                m,n,k,
                alpha,
                Amk,
                Bkn,
                beta,
                Cmn);
  }
  void gemmBatched(int m,int n, int k,
                   RealD alpha,
                   deviceVector<RealD*> &Amk,  // pointer list to matrices
                   deviceVector<RealD*> &Bkn,
                   RealD beta,
                   deviceVector<RealD*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                m,n,k,
                alpha,
                Amk,
                Bkn,
                beta,
                Cmn);
  }
  void gemmBatched(int m,int n, int k,
                   RealF alpha,
                   deviceVector<RealF*> &Amk,  // pointer list to matrices
                   deviceVector<RealF*> &Bkn,
                   RealF beta,
                   deviceVector<RealF*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                m,n,k,
                alpha,
                Amk,
                Bkn,
                beta,
                Cmn);
  }
  void gemmBatched(GridBLASOperation_t OpA,
  void gemmBatched(int m,int n, int k,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   ComplexD alpha,
                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexD*> &Bkn,
@ -185,36 +147,23 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    assert(Bkn.size()==batchCount);
    // Use C-row major storage, so transpose calls
    assert(Cmn.size()==batchCount);

    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;

    static deviceVector<ComplexD> alpha_p(1);
    static deviceVector<ComplexD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    RealD t0=usecond();
    // std::cout << "ZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    // std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasZgemmBatched(gridblasHandle,
                                   hOpA,
                                   HIPBLAS_OP_N,
                                   hOpB,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (hipblasDoubleComplex *) &alpha_p[0],
                                   (hipblasDoubleComplex **)&Amk[0], lda,
@ -226,17 +175,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasZgemmBatched(gridblasHandle,
                                  hOpA,
                                  CUBLAS_OP_N,
                                  hOpB,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (cuDoubleComplex *) &alpha_p[0],
                                  (cuDoubleComplex **)&Amk[0], lda,
@ -263,18 +204,15 @@ public:
      }
    }
#endif
    // synchronise();
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
    // std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    // std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
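As a reading aid only (not text from the compare): a minimal sketch of how the batched interface above is driven elsewhere on this page. Every name here (nb, nrhs, batch, Ad, Ap, ...) is illustrative, and the GridBLAS/deviceVector/acceleratorPut usage is assumed to behave as it does in the surrounding code; each batch entry is a device pointer into one flat allocation, and the convenience overloads simply pin both operands to GridBLAS_OP_N.

// Hypothetical usage sketch only; names and sizes are made up.
GridBLAS BLAS;
const int nb=32, nrhs=8, batch=64;
deviceVector<ComplexD>  Ad(nb*nb*batch), Bd(nb*nrhs*batch), Cd(nb*nrhs*batch);
deviceVector<ComplexD*> Ap(batch), Bp(batch), Cp(batch);
for(int s=0;s<batch;s++){
  ComplexD *a = (ComplexD *)&Ad[s*nb*nb];     // pointer list to matrices
  ComplexD *b = (ComplexD *)&Bd[s*nb*nrhs];
  ComplexD *c = (ComplexD *)&Cd[s*nb*nrhs];
  acceleratorPut(Ap[s],a);
  acceleratorPut(Bp[s],b);
  acceleratorPut(Cp[s],c);
}
BLAS.gemmBatched(nb,nrhs,nb, ComplexD(1.0), Ap, Bp, ComplexD(0.0), Cp); // C = A * B for each batch entry
BLAS.synchronise();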
  void gemmBatched(GridBLASOperation_t OpA,
  void gemmBatched(int m,int n, int k,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   ComplexF alpha,
                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexF*> &Bkn,
@ -283,35 +221,23 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    static deviceVector<ComplexF> alpha_p(1);
    static deviceVector<ComplexF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
    RealD t0=usecond();
    // std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasCgemmBatched(gridblasHandle,
                                   hOpA,
                                   HIPBLAS_OP_N,
                                   hOpB,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (hipblasComplex *) &alpha_p[0],
                                   (hipblasComplex **)&Amk[0], lda,
@ -319,21 +245,13 @@ public:
                                   (hipblasComplex *) &beta_p[0],
                                   (hipblasComplex **)&Cmn[0], ldc,
                                   batchCount);
    // std::cout << " hipblas return code " <<(int)err<<std::endl;
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasCgemmBatched(gridblasHandle,
                                  hOpA,
                                  CUBLAS_OP_N,
                                  hOpB,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (cuComplex *) &alpha_p[0],
                                  (cuComplex **)&Amk[0], lda,
@ -363,15 +281,16 @@ public:
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
    // std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
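A side note on the lda/ldb settings in the operation-flagged variants above (my own summary, not text from the compare): with column-major storage, op(A) must present as m x k and op(B) as k x n, so the leading dimension of the stored array flips to the other extent whenever a transpose or dagger flag is passed. The helper below is a hypothetical, standalone restatement of that rule.

// Hedged sketch: leading dimensions for a column-major GEMM C(m x n) = op(A) * op(B).
inline void gemmLeadingDims(int m, int n, int k,
                            bool opA_is_N, bool opB_is_N,
                            int &lda, int &ldb, int &ldc)
{
  lda = opA_is_N ? m : k;  // A stored m x k if not transposed, else k x m
  ldb = opB_is_N ? k : n;  // B stored k x n if not transposed, else n x k
  ldc = m;                 // C is always written as m x n
}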
  ///////////////////////////////////////////////////////////////////////////
  // Single precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
  void gemmBatched(GridBLASOperation_t OpA,
  void gemmBatched(int m,int n, int k,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   RealF alpha,
                   deviceVector<RealF*> &Amk,  // pointer list to matrices
                   deviceVector<RealF*> &Bkn,
@ -380,35 +299,23 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;
    static deviceVector<RealF> alpha_p(1);
    static deviceVector<RealF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
    RealD t0=usecond();
    // std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasSgemmBatched(gridblasHandle,
                                   hOpA,
                                   HIPBLAS_OP_N,
                                   hOpB,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (float *) &alpha_p[0],
                                   (float **)&Amk[0], lda,
@ -419,17 +326,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasSgemmBatched(gridblasHandle,
                                  hOpA,
                                  CUBLAS_OP_N,
                                  hOpB,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (float *) &alpha_p[0],
                                  (float **)&Amk[0], lda,
@ -459,6 +358,9 @@ public:
    RealD t1=usecond();
    RealD flops = 2.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
    // std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
@ -466,9 +368,7 @@ public:
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
  void gemmBatched(GridBLASOperation_t OpA,
  void gemmBatched(int m,int n, int k,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   RealD alpha,
                   deviceVector<RealD*> &Amk,  // pointer list to matrices
                   deviceVector<RealD*> &Bkn,
@ -477,33 +377,20 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;
    if(OpB!=GridBLAS_OP_N)
      ldb = n;

    static deviceVector<RealD> alpha_p(1);
    static deviceVector<RealD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
    RealD t0=usecond();
    // std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    hipblasOperation_t hOpA;
    hipblasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasDgemmBatched(gridblasHandle,
                                   HIPBLAS_OP_N,
                                   HIPBLAS_OP_N,
@ -517,17 +404,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    cublasOperation_t hOpA;
    cublasOperation_t hOpB;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasDgemmBatched(gridblasHandle,
                                  hOpA,
                                  CUBLAS_OP_N,
                                  hOpB,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (double *) &alpha_p[0],
                                  (double **)&Amk[0], lda,
@ -573,6 +452,9 @@ public:
    RealD t1=usecond();
    RealD flops = 2.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
    // std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    // std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
@ -647,36 +529,6 @@ public:
#endif
  }

  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
  {
    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
    int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
    ComplexD alpha(1.0);
    ComplexD beta (1.0);
    for(int i=0;i<10;i++){
      RealD t0 = usecond();
      for(int s=0;s<nstencil;s++){
        gemmStridedBatched(nbasis,nrhs,nbasis,
                           alpha,
                           &A[0], // m x k
                           &B[0], // k x n
                           beta,
                           &C[0], // m x n
                           coarseVol);
      }
      synchronise();
      RealD t1 = usecond();
      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
      std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
      std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    }
  }
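For orientation only: the GF/s and GB/s figures printed by benchmark() above come from a simple operation count, 8 flops per complex multiply-add over an nbasis x nbasis times nbasis x nrhs product repeated for every coarse site and stencil leg, with the byte count charging A once and B plus a C read and a C write (the factor 3 on nbasis*nrhs). A standalone restatement with hypothetical names:

// Illustrative model, not library code: sustained rates for the strided-batched ZGEMM loop.
struct CoarseGemmRates {
  double gflops, gbytes;   // rates per second
  CoarseGemmRates(int nbasis,int nrhs,int coarseVol,int nstencil,double seconds){
    double flops = 8.0*nbasis*nbasis*nrhs*(double)coarseVol*nstencil;                          // complex FMA count
    double bytes = 16.0*((double)nbasis*nbasis + 3.0*nbasis*nrhs)*(double)coarseVol*nstencil;  // 16 bytes per ComplexD
    gflops = flops/seconds/1.e9;
    gbytes = bytes/seconds/1.e9;
  }
};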
@ -80,7 +80,7 @@ public:
    for(int p=0;p<geom.npoint;p++){
      if ( zero_shift==geom.shifts[p] ) {
        _A[p] = _A[p]+shift;
        // _Adag[p] = _Adag[p]+shift;
        _Adag[p] = _Adag[p]+shift;
      }
    }
  }
@ -94,7 +94,7 @@ public:
      // Avoids brutal handling of Grid pointers
      if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
        _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
        // _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
        _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
        nfound++;
      }
    }
@ -115,7 +115,7 @@ public:
      int npoint = _geom.npoint;
    }
    _A.resize(geom.npoint,CoarseGrid);
    // _Adag.resize(geom.npoint,CoarseGrid);
    _Adag.resize(geom.npoint,CoarseGrid);
  }
  void M (const CoarseVector &in, CoarseVector &out)
  {
@ -123,10 +123,8 @@ public:
  }
  void Mdag (const CoarseVector &in, CoarseVector &out)
  {
    assert(hermitian);
    if ( hermitian ) M(in,out);
    Mult(_A,in,out);
    else Mult(_Adag,in,out);
    // if ( hermitian ) M(in,out);
    // else Mult(_Adag,in,out);
  }
  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
@ -301,145 +299,6 @@ public:
   *
   * Where q_k = delta_k . (2*M_PI/global_nb[mu])
   */
#if 0
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();

    RealD tproj=0.0;
    RealD teigen=0.0;
    RealD tmat=0.0;
    RealD tphase=0.0;
    RealD tinv=0.0;

    /////////////////////////////////////////////////////////////
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid());
    blockOrthogonalise(InnerProd,Subspace.subspace);

    const int npoint = geom.npoint;

    Coordinate clatt = CoarseGrid()->GlobalDimensions();
    int Nd = CoarseGrid()->Nd();

    /*
     * Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
     * Matrix index i is mapped to this shift via
     * geom.shifts[i]
     *
     * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
     *  = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
     *  = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
     *  = M_{kl} A_ji^{b.b+l}
     *
     * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
     *
     * Where q_k = delta_k . (2*M_PI/global_nb[mu])
     *
     * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
     */
    teigen-=usecond();
    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    ComplexD ci(0.0,1.0);
    for(int k=0;k<npoint;k++){ // Loop over momenta

      for(int l=0;l<npoint;l++){ // Loop over nbr relative
        ComplexD phase(0.0,0.0);
        for(int mu=0;mu<Nd;mu++){
          RealD TwoPiL = M_PI * 2.0/ clatt[mu];
          phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
        }
        phase=exp(phase*ci);
        Mkl(k,l) = phase;
      }
    }
    invMkl = Mkl.inverse();
    teigen+=usecond();

    ///////////////////////////////////////////////////////////////////////
    // Now compute the matrix elements of linop between the orthonormal
    // set of vectors.
    ///////////////////////////////////////////////////////////////////////
    FineField phaV(grid); // Phased block basis vector
    FineField MphaV(grid);// Matrix applied
    CoarseVector coarseInner(CoarseGrid());

    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
    for(int i=0;i<nbasis;i++){// Loop over basis vectors
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
        /////////////////////////////////////////////////////
        // Stick a phase on every block
        /////////////////////////////////////////////////////
        tphase-=usecond();
        CoarseComplexField coor(CoarseGrid());
        CoarseComplexField pha(CoarseGrid());  pha=Zero();
        for(int mu=0;mu<Nd;mu++){
          LatticeCoordinate(coor,mu);
          RealD TwoPiL = M_PI * 2.0/ clatt[mu];
          pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
        }
        pha =exp(pha*ci);
        phaV=Zero();
        blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
        tphase+=usecond();

        /////////////////////////////////////////////////////////////////////
        // Multiple phased subspace vector by matrix and project to subspace
        // Remove local bulk phase to leave relative phases
        /////////////////////////////////////////////////////////////////////
        tmat-=usecond();
        linop.Op(phaV,MphaV);
        tmat+=usecond();

        tproj-=usecond();
        blockProject(coarseInner,MphaV,Subspace.subspace);
        coarseInner = conjugate(pha) * coarseInner;

        ComputeProj[p] = coarseInner;
        tproj+=usecond();

      }

      tinv-=usecond();
      for(int k=0;k<npoint;k++){
        FT[k] = Zero();
        for(int l=0;l<npoint;l++){
          FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
        }

        int osites=CoarseGrid()->oSites();
        autoView( A_v  , _A[k], AcceleratorWrite);
        autoView( FT_v , FT[k], AcceleratorRead);
        accelerator_for(sss, osites, 1, {
          for(int j=0;j<nbasis;j++){
            A_v[sss](i,j) = FT_v[sss](j);
          }
        });
      }
      tinv+=usecond();
    }

    // Only needed if nonhermitian
    if ( ! hermitian ) {
      // std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
      // PopulateAdag();
    }

    // Need to write something to populate Adag from A
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat   "<<tmat <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator proj  "<<tproj<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv   "<<tinv<<" us"<<std::endl;
  }
#else
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
@ -550,7 +409,7 @@ public:
        tmat+=usecond();

        tproj-=usecond();
        blockProject(coarseInner,MphaV,Subspace.subspace);
        blockProjectFast(coarseInner,MphaV,Subspace.subspace);
        coarseInner = conjugate(pha[p]) * coarseInner;

        ComputeProj[p] = coarseInner;
@ -579,8 +438,8 @@ public:

    // Only needed if nonhermitian
    if ( ! hermitian ) {
      // std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
      std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
      // PopulateAdag();
      PopulateAdag();
    }

    // Need to write something to populate Adag from A
@ -592,11 +451,10 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator proj  "<<tproj<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv   "<<tinv<<" us"<<std::endl;
  }
#endif
  void ExchangeCoarseLinks(void){
    for(int p=0;p<geom.npoint;p++){
      _A[p] = Cell.ExchangePeriodic(_A[p]);
      // _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
      _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
    }
  }
  virtual void Mdiag    (const Field &in, Field &out){ assert(0);};
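The momentum trick described in the comment block of the disabled CoarsenOperator above is easier to follow written out. The following is my reconstruction in LaTeX of the relations the code implements, with \(L_c^{\mu}\) the coarse global extent clatt[mu] and \(\delta_l\) the stencil shifts geom.shifts[l]; the comment writes MdagM where the active code applies linop.Op, but either way inverting the small npoint x npoint phase matrix recovers the individual stencil links from the phased projections.

\[
  M_{kl} = e^{\, i\, q_k \cdot \delta_l }, \qquad
  q_k^{\mu} = \frac{2\pi}{L_c^{\mu}}\, \delta_k^{\mu},
\]
\[
  \mathrm{ComputeProj}_{k,b,i,j}
   \;=\; \sum_{l \in \mathrm{ball}} e^{\, i\, q_k \cdot \delta_l}
          \,\big\langle \phi_{b,j} \,\big|\, M \,\big|\, \phi_{b+\delta_l,\, i} \big\rangle
   \;=\; \sum_{l} M_{kl}\, A^{\,b,\,b+\delta_l}_{ji},
\]
\[
  A^{\,b,\,b+\delta_l}_{ji} \;=\; \sum_{m} \big(M^{-1}\big)_{lm}\, \mathrm{ComputeProj}_{m,b,i,j}.
\]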
@ -27,10 +27,26 @@ Author: Peter Boyle <pboyle@bnl.gov>
/* END LEGAL */
#pragma once

#include <Grid/algorithms/multigrid/BatchedBlas.h>

NAMESPACE_BEGIN(Grid);

// Move this to accelerator.h
// Also give a copy device.
// Rename acceleratorPut
// Rename acceleratorGet
template<class T> void deviceSet(T& dev,T&host)
{
  acceleratorCopyToDevice(&host,&dev,sizeof(T));
}
template<class T> T deviceGet(T& dev)
{
  T host;
  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
  return host;
}

// Fine Object == (per site) type of fine field
//    nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
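deviceSet and deviceGet above are one-element host-to-device and device-to-host copies (the comments suggest they may eventually be renamed acceleratorPut/acceleratorGet and moved to accelerator.h). A small, purely illustrative use, assuming the definitions exactly as written above:

// Hypothetical example: stash a raw device pointer in a device-resident pointer table,
// then read a single element back to the host for inspection.
deviceVector<ComplexD>  payload(16);
deviceVector<ComplexD*> pointers(1);
ComplexD *p = (ComplexD *)&payload[0];
deviceSet(pointers[0], p);              // copies the pointer value into device memory
ComplexD first = deviceGet(payload[0]); // copies one ComplexD back to the host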
@ -117,12 +133,14 @@ public:
    for(int p=0;p<geom.npoint;p++){
      for(int ss=0;ss<unpadded_sites;ss++){
        ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
        acceleratorPut(BLAS_AP[p][ss],ptr);
        //ComplexD *ptr = (ComplexD *)&BLAS_A[p][0]; std::cout << " A ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_A[p].size()<<std::endl;
        deviceSet(BLAS_AP[p][ss],ptr);
      }
    }
    for(int ss=0;ss<unpadded_sites;ss++){
      ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
      acceleratorPut(BLAS_CP[ss],ptr);
      //ComplexD *ptr = (ComplexD *)&BLAS_C[0]; std::cout << " C ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_C.size()<<std::endl;
      deviceSet(BLAS_CP[ss],ptr);
    }

    /////////////////////////////////////////////////
@ -137,14 +155,19 @@ public:
          ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
        }
      }
      // GeneralStencilEntryReordered tmp;
      if( ghost_zone==0) {
        for(int32_t point = 0 ; point < geom.npoint; point++){
          int i=s*orhs*geom.npoint+point;
          int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
          // std::cout << " B ptr "<< nbr<<"/"<<BLAS_B.size()<<std::endl;
          assert(nbr<BLAS_B.size());
          ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
          acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
          // ComplexD * ptr = (ComplexD *)&BLAS_B[0];
          // std::cout << " B ptr unpadded "<<std::hex<<ptr<<std::dec<<" "<<s<<"/"<<padded_sites<<std::endl;
          // std::cout << " B ptr padded "<<std::hex<<ptr<<std::dec<<" "<<j<<"/"<<unpadded_sites<<std::endl;
          deviceSet(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
          // auto tmp = deviceGet(*BLAS_BP[point][j]); // debug trigger SEGV if bad ptr
        }
        j++;
      }
@ -213,6 +236,7 @@ public:
#if 0
    std::vector<typename vobj::scalar_object> tmp;
    tmp.resize(in.size());
    // std::cout << "BLAStoGrid volume " <<tmp.size()<<" "<< grid.Grid()->lSites()<<std::endl;
    assert(in.size()==grid.Grid()->lSites());
    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
    vectorizeFromLexOrdArray(tmp,grid);
@ -265,10 +289,19 @@ public:
  }
  void CopyMatrix (void)
  {
    // Clone "A" to be lexicographic in the physics coords
    // Use unvectorisetolexordarray
    // Copy to device
    for(int p=0;p<geom.npoint;p++){
      //Unpadded
      auto Aup = _Op.Cell.Extract(_Op._A[p]);
      // Coordinate coor({0,0,0,0,0});
      // auto sval = peekSite(Aup,coor);
      // std::cout << "CopyMatrix: p "<<p<<" Aup[0] :"<<sval<<std::endl;
      // sval = peekSite(_Op._A[p],coor);
      // std::cout << "CopyMatrix: p "<<p<<" _Op._Ap[0] :"<<sval<<std::endl;
      GridtoBLAS(Aup,BLAS_A[p]);
      // std::cout << "Copy Matrix p "<<p<<" "<< deviceGet(BLAS_A[p][0])<<std::endl;
    }
  }
  void Mdag(const CoarseVector &in, CoarseVector &out)
@ -277,7 +310,7 @@ public:
  }
  void M (const CoarseVector &in, CoarseVector &out)
  {
    // std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
    std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
    conformable(CoarseGrid(),in.Grid());
    conformable(in.Grid(),out.Grid());
    out.Checkerboard() = in.Checkerboard();
@ -313,8 +346,11 @@ public:
    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
    assert(nrhs>=1);

    std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<<in.Grid()->lSites()<<" "<<pin.Grid()->lSites()<<std::endl;
    t_GtoB=-usecond();
    GridtoBLAS(pin,BLAS_B);
    // out = Zero();
    // GridtoBLAS(out,BLAS_C);
    t_GtoB+=usecond();

    GridBLAS BLAS;
@ -324,7 +360,7 @@ public:
      RealD c = 1.0;
      if (p==0) c = 0.0;
      ComplexD beta(c);
      // std::cout << GridLogMessage << "New Mrhs coarse gemmBatched "<<p<<std::endl;
      BLAS.gemmBatched(nbasis,nrhs,nbasis,
                       ComplexD(1.0),
                       BLAS_AP[p],
@ -334,12 +370,16 @@ public:
    }
    BLAS.synchronise();
    t_mult+=usecond();
    // std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid  "<<std::endl;
    t_BtoG=-usecond();
    BLAStoGrid(out,BLAS_C);
    t_BtoG+=usecond();
    t_tot+=usecond();
    // auto check =deviceGet(BLAS_C[0]);
    // std::cout << "C[0] "<<check<<std::endl;
    // Coordinate coor({0,0,0,0,0,0});
    // peekLocalSite(check,out,coor);
    // std::cout << "C[0] "<< check<<std::endl;
    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
@ -351,7 +391,7 @@ public:
    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
    // std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };
  virtual void Mdiag    (const Field &in, Field &out){ assert(0);};
  virtual void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
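In equations (my paraphrase, not text from the compare), the M() routine above applies the coarse stencil as one batched GEMM per stencil point p, with beta = 0 on the first point so the accumulator never needs to be cleared:

\[
  C_{j r}(x) \;=\; \sum_{p}\; \sum_{i=1}^{n_b}
      A^{(p)}_{j i}(x)\; B_{i r}\big(x + \delta_p\big),
  \qquad j = 1,\dots,n_b,\quad r = 1,\dots,n_{\rm rhs},
\]

where x runs over coarse sites, the \(A^{(p)}\) are the per-site coarse link matrices copied over in CopyMatrix(), and B is the multi-right-hand-side input laid out by GridtoBLAS.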
@ -29,6 +29,7 @@ Author: Peter Boyle <pboyle@bnl.gov>

#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/BatchedBlas.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
@ -474,7 +474,6 @@ void MemoryManager::Print(void)
  std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
  std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
  acceleratorMem();
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
@ -300,6 +300,38 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
  // std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
  // std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
}

template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
                             const Lattice<vobj>   &fineData,
                             const VLattice &Basis)
{
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();

  Lattice<iScalar<CComplex>> ip(coarse);
  Lattice<vobj>     fineDataRed = fineData;

  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( ip_         , ip,         AcceleratorWrite);
  RealD t_IP=0;
  RealD t_co=0;
  for(int v=0;v<nbasis;v++) {
    t_IP-=usecond();
    blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
    t_IP+=usecond();
    t_co-=usecond();
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
        convertType(coarseData_[sc](v),ip_[sc]);
      });
    t_co+=usecond();
  }
  // std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
  // std::cout << GridLogPerformance << " blockProjectFast : conv : "<<t_co<<" us"<<std::endl;
}

// This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level
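As a summary of what blockProjectFast computes (my restatement, not text from the compare): each coarse component v is the block-local inner product of basis vector v with the fine field,

\[
  \big(\mathrm{coarseData}(b)\big)_v \;=\;
  \sum_{x \,\in\, \mathrm{block}(b)} \big\langle \mathrm{Basis}_v(x),\, \mathrm{fineData}(x) \big\rangle,
  \qquad v = 0,\dots,n_{\rm basis}-1,
\]

i.e. nbasis calls to blockInnerProductD followed by a type conversion into component v; the header attached to the other copy of this routine later in the compare describes it as a faster but less accurate variant of blockProject.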
@ -744,12 +776,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int words=sizeof(vobj)/sizeof(vector_type);
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // the checks should guarantee that the operations are local
  //////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // checks should guarantee that the operations are local
  //////////////////////////////////////////////////////////////////////////////////////////

  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
@ -763,10 +792,12 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];

  ///////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // do the index calc on the GPU
  ///////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  Coordinate f_ostride = Fg->_ostride;
  Coordinate f_istride = Fg->_istride;
  Coordinate f_rdimensions = Fg->_rdimensions;
@ -774,15 +805,13 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate t_istride = Tg->_istride;
  Coordinate t_rdimensions = Tg->_rdimensions;

  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);

  const int words=sizeof(vobj)/sizeof(vector_type);
  accelerator_for(idx,nsite,1,{

      Coordinate from_coor, to_coor, base;
@ -807,143 +836,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
    });
}
template<class vobj>
void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int words=sizeof(vobj)/sizeof(vector_type);

  //////////////////////////////////////////////////////////////////////////////////////////
  // checks should guarantee that the operations are local
  //////////////////////////////////////////////////////////////////////////////////////////
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
  assert(!Tg->_isCheckerBoarded);
  int Nsimd = Fg->Nsimd();
  int nF = Fg->_ndimension;
  int nT = Tg->_ndimension;
  assert(nF+1 == nT);

  ///////////////////////////////////////////////////////////
  // do the index calc on the GPU
  ///////////////////////////////////////////////////////////
  Coordinate f_ostride = Fg->_ostride;
  Coordinate f_istride = Fg->_istride;
  Coordinate f_rdimensions = Fg->_rdimensions;
  Coordinate t_ostride = Tg->_ostride;
  Coordinate t_istride = Tg->_istride;
  Coordinate t_rdimensions = Tg->_rdimensions;
  Coordinate RegionSize = Fg->_ldimensions;
  size_t nsite = 1;
  for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);

  accelerator_for(idx,nsite,1,{

    Coordinate from_coor(nF), to_coor(nT);
    Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
    int j=0;
    for(int i=0;i<nT;i++){
      if ( i!=orthog ) {
        to_coor[i] = from_coor[j];
        j++;
      } else {
        to_coor[i] = slice;
      }
    }
    int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
    int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
    int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
    int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

    const vector_type* from = (const vector_type *)&from_v[from_oidx];
    vector_type* to = (vector_type *)&to_v[to_oidx];

    scalar_type stmp;
    for(int w=0;w<words;w++){
      stmp = getlane(from[w], from_lane);
      putlane(to[w], stmp, to_lane);
    }
  });
}

template<class vobj>
void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int words=sizeof(vobj)/sizeof(vector_type);

  //////////////////////////////////////////////////////////////////////////////////////////
  // checks should guarantee that the operations are local
  //////////////////////////////////////////////////////////////////////////////////////////
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
  assert(!Tg->_isCheckerBoarded);
  int Nsimd = Fg->Nsimd();
  int nF = Fg->_ndimension;
  int nT = Tg->_ndimension;
  assert(nT+1 == nF);

  ///////////////////////////////////////////////////////////
  // do the index calc on the GPU
  ///////////////////////////////////////////////////////////
  Coordinate f_ostride = Fg->_ostride;
  Coordinate f_istride = Fg->_istride;
  Coordinate f_rdimensions = Fg->_rdimensions;
  Coordinate t_ostride = Tg->_ostride;
  Coordinate t_istride = Tg->_istride;
  Coordinate t_rdimensions = Tg->_rdimensions;
  Coordinate RegionSize = Tg->_ldimensions;
  size_t nsite = 1;
  for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);

  accelerator_for(idx,nsite,1,{

    Coordinate from_coor(nF), to_coor(nT);
    Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
    int j=0;
    for(int i=0;i<nF;i++){
      if ( i!=orthog ) {
        from_coor[i] = to_coor[j];
        j++;
      } else {
        from_coor[i] = slice;
      }
    }
    int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
    int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
    int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
    int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

    const vector_type* from = (const vector_type *)&from_v[from_oidx];
    vector_type* to = (vector_type *)&to_v[to_oidx];

    scalar_type stmp;
    for(int w=0;w<words;w++){
      stmp = getlane(from[w], from_lane);
      putlane(to[w], stmp, to_lane);
    }
  });
}
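Both slice movers above use the same address split: a local coordinate is reduced modulo _rdimensions to pick the outer (vectorised) site and divided by _rdimensions to pick the SIMD lane, after which getlane/putlane move the payload word by word. A standalone sketch of just the index part (plain C++ with hypothetical names, no Grid types):

#include <vector>
// Illustrative only: split a local coordinate into (outer site index, SIMD lane)
// given per-dimension ostride/istride/rdimensions as Grid precomputes them.
struct OsiteLane { int oidx; int lane; };
inline OsiteLane splitCoordinate(const std::vector<int> &coor,
                                 const std::vector<int> &ostride,
                                 const std::vector<int> &istride,
                                 const std::vector<int> &rdim)
{
  OsiteLane r{0,0};
  for(size_t d=0; d<coor.size(); d++){
    r.oidx += ostride[d] * (coor[d] % rdim[d]); // position within the SIMD-reduced local grid
    r.lane += istride[d] * (coor[d] / rdim[d]); // which lane of the vectorised object holds it
  }
  return r;
}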
template<class vobj>
|
template<class vobj>
|
||||||
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
|
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
|
||||||
@ -1033,7 +925,9 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Can I implement with local copyregion??
|
//FIXME: make this run entirely on GPU
|
||||||
|
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
||||||
|
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
|
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -1054,18 +948,121 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
       assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
     }
   }
-  Coordinate sz = lg->_ldimensions;
-  sz[orthog]=1;
-  Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
-  Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
-  localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz);
+#if 1
+  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+
+  thread_for(idx,nsite,{
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lcoor[orthog] = slice_lo;
+    hcoor[orthog] = slice_hi;
+    size_t rem = idx;
+    for(int mu=0;mu<nl;mu++){
+      if(mu != orthog){
+        int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
+        lcoor[mu] = hcoor[mu] = xmu;
+      }
+    }
+    int loidx = lg->oIndex(lcoor);
+    int liidx = lg->iIndex(lcoor);
+    int hoidx = hg->oIndex(hcoor);
+    int hiidx = hg->iIndex(hcoor);
+    int* tt = table + 4*idx;
+    tt[0] = loidx;
+    tt[1] = liidx;
+    tt[2] = hoidx;
+    tt[3] = hiidx;
+  });
+
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(lowDim_v,lowDim,AcceleratorRead);
+  autoView(higherDim_v,higherDim,AcceleratorWrite);
+
+  accelerator_for(idx,nsite,1,{
+    static const int words=sizeof(vobj)/sizeof(vector_type);
+    int* tt = table_d + 4*idx;
+    int from_oidx = *tt++;
+    int from_lane = *tt++;
+    int to_oidx   = *tt++;
+    int to_lane   = *tt;
+
+    const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
+    vector_type* to = (vector_type *)&higherDim_v[to_oidx];
+
+    scalar_type stmp;
+    for(int w=0;w<words;w++){
+      stmp = getlane(from[w], from_lane);
+      putlane(to[w], stmp, to_lane);
+    }
+  });
+
+  acceleratorFreeDevice(table_d);
+  free(table);
+
+#else
+  // the above should guarantee that the operations are local
+  autoView(lowDimv,lowDim,CpuRead);
+  autoView(higherDimv,higherDim,CpuWrite);
+  thread_for(idx,lg->lSites(),{
+    sobj s;
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) {
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,lowDimv,lcoor);
+      pokeLocalSite(s,higherDimv,hcoor);
+    }
+  });
+#endif
 }

 template<class vobj>
 void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
-  InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog);
+  typedef typename vobj::scalar_object sobj;
+
+  GridBase *lg = lowDim.Grid();
+  GridBase *hg = higherDim.Grid();
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    if ( d!=orthog ) {
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+    }
+  }
+
+  // the above should guarantee that the operations are local
+  autoView(lowDimv,lowDim,CpuWrite);
+  autoView(higherDimv,higherDim,CpuRead);
+  thread_for(idx,lg->lSites(),{
+    sobj s;
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) {
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,higherDimv,hcoor);
+      pokeLocalSite(s,lowDimv,lcoor);
+    }
+  });
 }

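For orientation, here is a minimal, self-contained sketch (not Grid code) of the per-lane copy that the added accelerator_for body performs: each table row names a source outer index and lane plus a destination outer index and lane, and one scalar is moved per lane between SIMD vectors. The names Vec, get_lane, put_lane and CopyEntry below are illustrative stand-ins for Grid's vector_type, getlane, putlane and the four-integer table rows; this models one vector word only, whereas the real kernel loops over all words of vobj.

#include <array>
#include <cstdio>
#include <vector>

// Toy stand-in for a SIMD vector of 4 lanes (Grid's vector_type).
struct Vec { std::array<double,4> lane; };

// Illustrative equivalents of Grid's getlane/putlane.
double get_lane(const Vec &v, int l)     { return v.lane[l]; }
void   put_lane(Vec &v, double s, int l) { v.lane[l] = s; }

// One row of the copy table: {from_oidx, from_lane, to_oidx, to_lane}.
struct CopyEntry { int from_oidx, from_lane, to_oidx, to_lane; };

int main() {
  std::vector<Vec> low(2), high(2);
  for (int o = 0; o < 2; o++)
    for (int l = 0; l < 4; l++) high[o].lane[l] = 0.0, low[o].lane[l] = 10.0*o + l;

  // Table built once on the host; on the GPU path it is copied to the device.
  std::vector<CopyEntry> table = { {0,1, 1,3}, {1,2, 0,0} };

  // Body of the accelerator_for: one scalar moved per table entry and lane.
  for (const auto &t : table) {
    double s = get_lane(low[t.from_oidx], t.from_lane);
    put_lane(high[t.to_oidx], s, t.to_lane);
  }
  std::printf("high[1] lane 3 = %g, high[0] lane 0 = %g\n",
              high[1].lane[3], high[0].lane[0]);
  return 0;
}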
@@ -1778,34 +1775,31 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
 }

 //////////////////////////////////////////////////////
-// Faster but less accurate blockProject
+// MultiRHS interface support for coarse space
+// -- Simplest possible implementation to begin with
 //////////////////////////////////////////////////////
 template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
-                             const Lattice<vobj> &fineData,
+inline void blockProjectMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
+                             Lattice<iVector<CComplex,nbasis > > &coarseTMP,
+                             const VLattice &fineData, // Basis and fineData necessarily same type
                              const VLattice &Basis)
 {
-  GridBase * fine  = fineData.Grid();
-  GridBase * coarse= coarseData.Grid();
-  Lattice<iScalar<CComplex> > ip(coarse);
-  autoView( coarseData_ , coarseData, AcceleratorWrite);
-  autoView( ip_ , ip, AcceleratorWrite);
-  RealD t_IP=0;
-  RealD t_co=0;
-  for(int v=0;v<nbasis;v++) {
-    t_IP-=usecond();
-    blockInnerProductD(ip,Basis[v],fineData);
-    t_IP+=usecond();
-    t_co-=usecond();
-    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-      convertType(coarseData_[sc](v),ip_[sc]);
-    });
-    t_co+=usecond();
+  for(int r=0;r<fineData.size();r++){
+    blockProject(coarseTMP,fineData[r],Basis);
+    InsertSliceLocal(coarseTMP, coarseIP,r,r,0);
+  }
+}
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockPromoteMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
+                             Lattice<iVector<CComplex,nbasis > > &coarseTMP,
+                             const VLattice &fineData, // Basis and fineData necessarily same type
+                             const VLattice &Basis)
+{
+  for(int r=0;r<fineData.size();r++){
+    ExtractSliceLocal(coarseTMP, coarseIP,r,r,0);
+    blockPromote(coarseTMP,fineData[r],Basis);
   }
 }

 NAMESPACE_END(Grid);

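As a rough model of the blockProjectMany/blockPromoteMany pattern on the "+" side of this hunk: each right-hand side is projected independently with the existing single-RHS kernel, and the result is stored in slice r of a stacked coarse field, so nothing new is computed per RHS. The sketch below uses plain std::vector containers in place of Grid lattices; project_one and the stacked vector-of-vectors are illustrative stand-ins, not Grid API.

#include <cstdio>
#include <vector>

using Fine   = std::vector<double>;  // stand-in for a fine Lattice
using Coarse = std::vector<double>;  // stand-in for a coarse Lattice

// Stand-in for blockProject acting on one right-hand side.
Coarse project_one(const Fine &f) {
  return Coarse{ f[0] + f[1], f[2] + f[3] };   // toy 2-site coarsening
}

int main() {
  std::vector<Fine> fine = { {1,2,3,4}, {5,6,7,8}, {9,10,11,12} };

  // "Stacked" coarse field: slice r holds the projection of RHS r,
  // mirroring InsertSliceLocal(coarseTMP, coarseIP, r, r, 0).
  std::vector<Coarse> stacked(fine.size());
  for (size_t r = 0; r < fine.size(); r++)
    stacked[r] = project_one(fine[r]);

  for (size_t r = 0; r < stacked.size(); r++)
    std::printf("rhs %zu -> coarse { %g, %g }\n", r, stacked[r][0], stacked[r][1]);
  return 0;
}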
@@ -162,14 +162,8 @@ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 {
   uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
   uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
-  std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csuma<<" expected "<<scidac_checksuma <<std::endl;
-  std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csumb<<" expected "<<scidac_checksumb <<std::endl;
-  if ( scidac_csuma !=scidac_checksuma) {
-    return 0;
-  };
-  if ( scidac_csumb !=scidac_checksumb) {
-    return 0;
-  };
+  if ( scidac_csuma !=scidac_checksuma) return 0;
+  if ( scidac_csumb !=scidac_checksumb) return 0;
   return 1;
 }

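Both sides of this hunk parse the stored SciDAC checksums as hexadecimal strings and compare them with the freshly computed values; the change only drops the diagnostic printout and compacts the early returns. A minimal stand-alone illustration of the parse-and-compare step (the function name and the example hex values are made up for this sketch):

#include <cstdint>
#include <cstdio>
#include <string>

// Returns 1 if both stored checksums (hex strings) match the computed ones.
int checksum_verify(const std::string &suma_hex, const std::string &sumb_hex,
                    uint32_t computed_a, uint32_t computed_b) {
  uint32_t stored_a = static_cast<uint32_t>(std::stoull(suma_hex, nullptr, 16));
  uint32_t stored_b = static_cast<uint32_t>(std::stoull(sumb_hex, nullptr, 16));
  if (computed_a != stored_a) return 0;
  if (computed_b != stored_b) return 0;
  return 1;
}

int main() {
  std::printf("%d\n", checksum_verify("deadbeef", "0badf00d", 0xdeadbeefu, 0x0badf00du)); // 1
  std::printf("%d\n", checksum_verify("deadbeef", "0badf00d", 0xdeadbeefu, 0x00000000u)); // 0
  return 0;
}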
@@ -120,7 +120,7 @@ hipStream_t computeStream;
 void acceleratorInit(void)
 {
   int nDevices = 1;
-  auto discard = hipGetDeviceCount(&nDevices);
+  hipGetDeviceCount(&nDevices);
   gpu_props = new hipDeviceProp_t[nDevices];

   char * localRankStr = NULL;
@@ -147,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)         GPU_PROP_FMT(canMapHostMemory,"%d");

-    discard = hipGetDeviceProperties(&gpu_props[i], i);
+    hipGetDeviceProperties(&gpu_props[i], i);
     hipDeviceProp_t prop;
     prop = gpu_props[i];
     totalDeviceMem = prop.totalGlobalMem;
@@ -184,13 +184,13 @@ void acceleratorInit(void)
   }
   int device = rank;
 #endif
-  discard = hipSetDevice(device);
-  discard = hipStreamCreate(&copyStream);
-  discard = hipStreamCreate(&computeStream);
+  hipSetDevice(device);
+  hipStreamCreate(&copyStream);
+  hipStreamCreate(&computeStream);
   const int len=64;
   char busid[len];
   if( rank == world_rank ) {
-    discard = hipDeviceGetPCIBusId(busid, len, device);
+    hipDeviceGetPCIBusId(busid, len, device);
     printf("local rank %d device %d bus id: %s\n", rank, device, busid);
   }
   if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");

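The left-hand side of these hunks binds every HIP return code to a local discard variable; that is the usual way to deliberately ignore a result from a function whose return value would otherwise trigger a must-use warning. A library-agnostic illustration of that pattern (fake_device_count is a hypothetical stand-in, not a HIP call):

#include <cstdio>

// Stand-in for an API call whose return code should normally be checked.
[[nodiscard]] int fake_device_count(int *n) { *n = 1; return 0; }

int main() {
  int nDevices = 0;
  // A bare call would warn under -Wunused-result with [[nodiscard]];
  // binding the result to a named variable documents that it is ignored on purpose.
  auto discard = fake_device_count(&nDevices);
  (void)discard;
  std::printf("devices: %d\n", nDevices);
  return 0;
}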
@@ -117,7 +117,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific

-inline void acceleratorMem(void)
+inline void cuda_mem(void)
 {
   size_t free_t,total_t,used_t;
   cudaMemGetInfo(&free_t,&total_t);
@@ -125,11 +125,6 @@ inline void acceleratorMem(void)
   std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
 }

-inline void cuda_mem(void)
-{
-  acceleratorMem();
-}
-
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
     int nt=acceleratorThreads(); \
@@ -275,7 +270,6 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };

-
 inline int acceleratorIsCommunicable(void *ptr)
 {
   //  int uvm=0;
@@ -311,11 +305,6 @@ NAMESPACE_END(Grid);

 NAMESPACE_BEGIN(Grid);

-inline void acceleratorMem(void)
-{
-  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
-}
-
 extern cl::sycl::queue *theGridAccelerator;
 extern cl::sycl::queue *theCopyAccelerator;

|
|||||||
#define accelerator __host__ __device__
|
#define accelerator __host__ __device__
|
||||||
#define accelerator_inline __host__ __device__ inline
|
#define accelerator_inline __host__ __device__ inline
|
||||||
|
|
||||||
inline void acceleratorMem(void)
|
|
||||||
{
|
|
||||||
size_t free_t,total_t,used_t;
|
|
||||||
auto discard = hipMemGetInfo(&free_t,&total_t);
|
|
||||||
used_t=total_t-free_t;
|
|
||||||
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
extern hipStream_t copyStream;
|
extern hipStream_t copyStream;
|
||||||
extern hipStream_t computeStream;
|
extern hipStream_t computeStream;
|
||||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||||
@ -479,7 +459,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
auto err = hipMallocManaged((void **)&ptr,bytes);
|
auto err = hipMallocManaged((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
@ -491,7 +471,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
auto err = hipMalloc((void **)&ptr,bytes);
|
auto err = hipMalloc((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
@ -534,12 +514,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
|
||||||
{
|
|
||||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
|
||||||
acceleratorCopySynchronise();
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// CPU Target - No accelerator just thread instead
|
// CPU Target - No accelerator just thread instead
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
@ -549,15 +523,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
|||||||
#undef GRID_SIMT
|
#undef GRID_SIMT
|
||||||
|
|
||||||
|
|
||||||
inline void acceleratorMem(void)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
struct rusage rusage;
|
|
||||||
getrusage( RUSAGE_SELF, &rusage );
|
|
||||||
return (size_t)rusage.ru_maxrss;
|
|
||||||
*/
|
|
||||||
std::cout <<" system acceleratorMem not implemented"<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define accelerator
|
#define accelerator
|
||||||
#define accelerator_inline strong_inline
|
#define accelerator_inline strong_inline
|
||||||
@ -651,17 +616,4 @@ accelerator_inline void acceleratorFence(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class T> void acceleratorPut(T& dev,T&host)
|
|
||||||
{
|
|
||||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
|
||||||
}
|
|
||||||
template<class T> T acceleratorGet(T& dev)
|
|
||||||
{
|
|
||||||
T host;
|
|
||||||
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
|
||||||
return host;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -414,7 +414,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
// Logging
|
// Logging
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
std::vector<std::string> logstreams;
|
std::vector<std::string> logstreams;
|
||||||
std::string defaultLog("Error,Warning,Message");
|
std::string defaultLog("Error,Warning,Message,Memory");
|
||||||
GridCmdOptionCSL(defaultLog,logstreams);
|
GridCmdOptionCSL(defaultLog,logstreams);
|
||||||
GridLogConfigure(logstreams);
|
GridLogConfigure(logstreams);
|
||||||
|
|
||||||
|
@ -26,6 +26,11 @@ Author: Peter Boyle <pboyle@bnl.gov>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/Grid.h>
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/lattice/PaddedCell.h>
|
||||||
|
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||||
|
//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
|
||||||
|
#include <Grid/algorithms/iterative/AdefGeneric.h>
|
||||||
|
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
@ -107,44 +112,6 @@ void LoadBasis(aggregation &Agg, std::string file)
|
|||||||
RD.close();
|
RD.close();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
template<class CoarseVector>
|
|
||||||
void SaveEigenvectors(std::vector<RealD> &eval,
|
|
||||||
std::vector<CoarseVector> &evec,
|
|
||||||
std::string evec_file,
|
|
||||||
std::string eval_file)
|
|
||||||
{
|
|
||||||
#ifdef HAVE_LIME
|
|
||||||
emptyUserRecord record;
|
|
||||||
ScidacWriter WR(evec[0].Grid()->IsBoss());
|
|
||||||
WR.open(evec_file);
|
|
||||||
for(int b=0;b<evec.size();b++){
|
|
||||||
WR.writeScidacFieldRecord(evec[b],record,0,0);
|
|
||||||
}
|
|
||||||
WR.close();
|
|
||||||
XmlWriter WRx(eval_file);
|
|
||||||
write(WRx,"evals",eval);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
template<class CoarseVector>
|
|
||||||
void LoadEigenvectors(std::vector<RealD> &eval,
|
|
||||||
std::vector<CoarseVector> &evec,
|
|
||||||
std::string evec_file,
|
|
||||||
std::string eval_file)
|
|
||||||
{
|
|
||||||
#ifdef HAVE_LIME
|
|
||||||
XmlReader RDx(eval_file);
|
|
||||||
read(RDx,"evals",eval);
|
|
||||||
emptyUserRecord record;
|
|
||||||
|
|
||||||
Grid::ScidacReader RD ;
|
|
||||||
RD.open(evec_file);
|
|
||||||
assert(evec.size()==eval.size());
|
|
||||||
for(int k=0;k<eval.size();k++) {
|
|
||||||
RD.readScidacFieldRecord(evec[k],record);
|
|
||||||
}
|
|
||||||
RD.close();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD InverseApproximation(RealD x){
|
RealD InverseApproximation(RealD x){
|
||||||
return 1.0/x;
|
return 1.0/x;
|
||||||
@ -202,7 +169,6 @@ public:
|
|||||||
void operator() (const Field &in, Field &out)
|
void operator() (const Field &in, Field &out)
|
||||||
{
|
{
|
||||||
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
|
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
|
||||||
out=Zero();
|
|
||||||
CG(_SmootherOperator,in,out);
|
CG(_SmootherOperator,in,out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -216,10 +182,9 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
const int Ls=24;
|
const int Ls=24;
|
||||||
const int nbasis = 62;
|
// const int nbasis = 62;
|
||||||
// const int nbasis = 56;
|
// const int nbasis = 56;
|
||||||
// const int nbasis = 44;
|
const int nbasis = 36;
|
||||||
// const int nbasis = 36;
|
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
RealD mass=0.00078;
|
RealD mass=0.00078;
|
||||||
RealD M5=1.8;
|
RealD M5=1.8;
|
||||||
@ -271,8 +236,12 @@ int main (int argc, char ** argv)
|
|||||||
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
|
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
|
||||||
HermFineMatrix FineHermOp(HermOpEO);
|
HermFineMatrix FineHermOp(HermOpEO);
|
||||||
|
|
||||||
|
LatticeFermion result(FrbGrid); result=Zero();
|
||||||
|
|
||||||
|
LatticeFermion src(FrbGrid); random(RNG5,src);
|
||||||
|
|
||||||
// Run power method on FineHermOp
|
// Run power method on FineHermOp
|
||||||
// PowerMethod<LatticeFermion> PM; PM(HermOpEO,src);
|
PowerMethod<LatticeFermion> PM; PM(HermOpEO,src);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
///////////// Coarse basis and Little Dirac Operator ///////
|
///////////// Coarse basis and Little Dirac Operator ///////
|
||||||
@ -292,22 +261,15 @@ int main (int argc, char ** argv)
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
|
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
|
||||||
|
|
||||||
std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.18node.62");
|
std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62");
|
||||||
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.18node.62");
|
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62");
|
||||||
std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.18node.62");
|
std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62");
|
||||||
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
|
|
||||||
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
|
|
||||||
bool load_agg=true;
|
bool load_agg=true;
|
||||||
bool load_refine=true;
|
bool load_refine=true;
|
||||||
bool load_mat=true;
|
bool load_mat=false;
|
||||||
bool load_evec=false;
|
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
|
|
||||||
int refine=1;
|
|
||||||
if ( load_agg ) {
|
if ( load_agg ) {
|
||||||
if ( !(refine) || (!load_refine) ) {
|
|
||||||
LoadBasis(Aggregates,subspace_file);
|
LoadBasis(Aggregates,subspace_file);
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// NBASIS=40
|
// NBASIS=40
|
||||||
@ -379,6 +341,7 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
}
|
}
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
|
|
||||||
|
int refine=1;
|
||||||
if(refine){
|
if(refine){
|
||||||
if ( load_refine ) {
|
if ( load_refine ) {
|
||||||
LoadBasis(Aggregates,refine_file);
|
LoadBasis(Aggregates,refine_file);
|
||||||
@ -402,41 +365,14 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
CoarseVector c_res(Coarse5d);
|
CoarseVector c_res(Coarse5d);
|
||||||
CoarseVector c_ref(Coarse5d);
|
CoarseVector c_ref(Coarse5d);
|
||||||
|
|
||||||
if (0){
|
|
||||||
///////////////////////////////////////////////////
|
|
||||||
// Test the operator
|
|
||||||
///////////////////////////////////////////////////
|
|
||||||
CoarseVector c_proj(Coarse5d);
|
|
||||||
LatticeFermionD tmp(FrbGrid);
|
|
||||||
LatticeFermionD prom(FrbGrid);
|
|
||||||
|
|
||||||
blockPromote(c_src,prom,Aggregates.subspace);
|
|
||||||
|
|
||||||
FineHermOp.HermOp(prom,tmp);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<" Calling big dirac op "<<norm2(tmp)<<std::endl;
|
|
||||||
blockProject(c_proj,tmp,Aggregates.subspace);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<" Calling little Dirac Op "<<std::endl;
|
|
||||||
|
|
||||||
LittleDiracOp.M(c_src,c_res);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
|
|
||||||
|
|
||||||
c_proj = c_proj - c_res;
|
|
||||||
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Try projecting to one hop only
|
// Try projecting to one hop only
|
||||||
// LittleDiracOp.ShiftMatrix(1.0e-4);
|
// LittleDiracOp.ShiftMatrix(1.0e-4);
|
||||||
// LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
|
LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
|
||||||
// LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
|
LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
|
||||||
|
|
||||||
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
|
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
|
||||||
HermMatrix CoarseOp (LittleDiracOp);
|
HermMatrix CoarseOp (LittleDiracOp);
|
||||||
// HermMatrix CoarseOpProj (LittleDiracOpProj);
|
HermMatrix CoarseOpProj (LittleDiracOpProj);
|
||||||
|
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
@ -452,12 +388,12 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
// int Nm=160;
|
// int Nm=160;
|
||||||
|
|
||||||
// Chebyshev<CoarseVector> IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk.
|
// Chebyshev<CoarseVector> IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk.
|
||||||
// Chebyshev<CoarseVector> IRLCheby(0.04,40.0,201);
|
Chebyshev<CoarseVector> IRLCheby(0.04,40.0,201); //319 HDCG iters @ 128//160 nk.
|
||||||
int Nk=192;
|
int Nk=192;
|
||||||
int Nm=256;
|
int Nm=256;
|
||||||
int Nstop=Nk;
|
int Nstop=Nk;
|
||||||
|
|
||||||
Chebyshev<CoarseVector> IRLCheby(0.005,40.0,201); // 1 iter
|
// Chebyshev<CoarseVector> IRLCheby(0.010,45.0,201); // 1 iter
|
||||||
FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
|
FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
|
||||||
PlainHermOp<CoarseVector> IRLOp (CoarseOp);
|
PlainHermOp<CoarseVector> IRLOp (CoarseOp);
|
||||||
|
|
||||||
@ -469,25 +405,14 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
|
|
||||||
PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);
|
PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);
|
||||||
|
|
||||||
if ( load_evec ) {
|
|
||||||
eval.resize(Nstop);
|
|
||||||
evec.resize(Nstop,Coarse5d);
|
|
||||||
LoadEigenvectors(eval,evec,evec_file,eval_file);
|
|
||||||
} else {
|
|
||||||
IRL.calc(eval,evec,c_src,Nconv);
|
IRL.calc(eval,evec,c_src,Nconv);
|
||||||
assert(Nstop==eval.size());
|
|
||||||
SaveEigenvectors(eval,evec,evec_file,eval_file);
|
|
||||||
}
|
|
||||||
|
|
||||||
DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
|
DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
|
||||||
|
|
||||||
MultiRHSDeflation<CoarseVector> MrhsGuesser;
|
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Build a coarse space solver
|
// Build a coarse space solver
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
int maxit=30000;
|
int maxit=30000;
|
||||||
ConjugateGradient<CoarseVector> CG(1.0e-10,maxit,false);
|
ConjugateGradient<CoarseVector> CG(1.0e-8,maxit,false);
|
||||||
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
|
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
|
||||||
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
|
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
|
||||||
|
|
||||||
@ -502,8 +427,8 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
// Deflated (with real op EV's) solve for the projected coarse op
|
// Deflated (with real op EV's) solve for the projected coarse op
|
||||||
// Work towards ADEF1 in the coarse space
|
// Work towards ADEF1 in the coarse space
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
|
HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
|
||||||
// c_res=Zero();
|
c_res=Zero();
|
||||||
// HPDSolveProj(c_src,c_res);
|
// HPDSolveProj(c_src,c_res);
|
||||||
// std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
|
// std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
|
||||||
// std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
|
// std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
|
||||||
@ -513,7 +438,7 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Coarse ADEF1 with deflation space
|
// Coarse ADEF1 with deflation space
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// ChebyshevSmoother<CoarseVector > CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence
|
ChebyshevSmoother<CoarseVector > CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence
|
||||||
// CoarseSmoother(0.1,37.,8,CoarseOpProj); //
|
// CoarseSmoother(0.1,37.,8,CoarseOpProj); //
|
||||||
// CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s
|
// CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s
|
||||||
// CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s
|
// CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s
|
||||||
@ -591,9 +516,9 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
// mrhs coarse solve
|
// mrhs coarse solve
|
||||||
// Create a higher dim coarse grid
|
// Create a higher dim coarse grid
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);
|
ConjugateGradient<CoarseVector> coarseCG(2.0e-2,20000,true);
|
||||||
|
|
||||||
const int nrhs=vComplex::Nsimd()*3;
|
const int nrhs=vComplex::Nsimd();
|
||||||
|
|
||||||
Coordinate mpi=GridDefaultMpi();
|
Coordinate mpi=GridDefaultMpi();
|
||||||
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
||||||
@ -606,7 +531,7 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
|
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
|
||||||
MrhsHermMatrix MrhsCoarseOp (mrhs);
|
MrhsHermMatrix MrhsCoarseOp (mrhs);
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
#if 1
|
|
||||||
{
|
{
|
||||||
CoarseVector rh_res(CoarseMrhs);
|
CoarseVector rh_res(CoarseMrhs);
|
||||||
CoarseVector rh_guess(CoarseMrhs);
|
CoarseVector rh_guess(CoarseMrhs);
|
||||||
@ -614,39 +539,19 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
|
|
||||||
rh_res= Zero();
|
rh_res= Zero();
|
||||||
rh_guess= Zero();
|
rh_guess= Zero();
|
||||||
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
std::cout << " MrhsGuesser importing"<<std::endl;
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
MrhsGuesser.ImportEigenBasis(evec,eval);
|
|
||||||
std::vector<CoarseVector> BlasGuess(nrhs,Coarse5d);
|
|
||||||
std::vector<CoarseVector> BlasSource(nrhs,Coarse5d);
|
|
||||||
for(int r=0;r<nrhs;r++){
|
for(int r=0;r<nrhs;r++){
|
||||||
random(CRNG,BlasSource[r]);
|
random(CRNG,c_src);
|
||||||
}
|
|
||||||
|
|
||||||
MrhsGuesser.DeflateSources(BlasSource,BlasGuess);
|
|
||||||
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
std::cout << "**** DeflCoarseGuesser &&&&& "<<std::endl;
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
c_src=BlasSource[r];
|
|
||||||
DeflCoarseGuesser(c_src,c_res);
|
DeflCoarseGuesser(c_src,c_res);
|
||||||
std::cout << "Deflated guess "<< norm2(c_res)<<std::endl;
|
|
||||||
std::cout << "Blas deflated guess "<< norm2(BlasGuess[r])<<std::endl;
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
BlasGuess[r] = BlasGuess[r] - c_res;
|
|
||||||
std::cout << "Diff " <<norm2(BlasGuess[r])<<std::endl;
|
|
||||||
std::cout << "*************************"<<std::endl;
|
|
||||||
InsertSlice(c_res,rh_res,r,0);
|
InsertSlice(c_res,rh_res,r,0);
|
||||||
InsertSlice(c_res,rh_guess,r,0);
|
InsertSlice(c_res,rh_guess,r,0);
|
||||||
InsertSlice(c_src,rh_src,r,0);
|
InsertSlice(c_src,rh_src,r,0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MemoryManager::Print();
|
||||||
coarseCG(MrhsCoarseOp,rh_src,rh_res);
|
coarseCG(MrhsCoarseOp,rh_src,rh_res);
|
||||||
|
MemoryManager::Print();
|
||||||
|
//redo with block CG
|
||||||
|
|
||||||
//redo with block CG ?
|
|
||||||
for(int r=0;r<nrhs;r++){
|
for(int r=0;r<nrhs;r++){
|
||||||
std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
|
std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
|
||||||
ExtractSlice(c_src,rh_src,r,0);
|
ExtractSlice(c_src,rh_src,r,0);
|
||||||
@ -661,32 +566,18 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
|
|||||||
assert(diff < 1.0e-1);
|
assert(diff < 1.0e-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
MemoryManager::Print();
|
||||||
|
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
// fine solve
|
// fine solve
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
// std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters
|
// std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters
|
||||||
// std::vector<RealD> los({2.0});
|
std::vector<RealD> los({2.0}); // Nbasis 40 == 36,36 iters
|
||||||
// std::vector<RealD> los({2.5});
|
|
||||||
|
|
||||||
// std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
|
// std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
|
||||||
// std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
|
// std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
|
||||||
// std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)
|
std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)
|
||||||
|
|
||||||
// 148 outer
|
|
||||||
// std::vector<RealD> los({1.0});
|
|
||||||
// std::vector<int> ords({24});
|
|
||||||
|
|
||||||
// 162 outer
|
|
||||||
// std::vector<RealD> los({2.5});
|
|
||||||
// std::vector<int> ords({9});
|
|
||||||
|
|
||||||
// ??? outer
|
|
||||||
std::vector<RealD> los({2.0});
|
|
||||||
std::vector<int> ords({7});
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Smoother opt @56 nbasis, 0.04 convergence, 192 evs
|
Smoother opt @56 nbasis, 0.04 convergence, 192 evs
|
||||||
@ -789,33 +680,31 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo
|
|||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Build a HDCG solver
|
// Build a HDCG solver
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
|
/* TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
|
||||||
HDCG(1.0e-8, 700,
|
HDCG(1.0e-8, 700,
|
||||||
FineHermOp,
|
FineHermOp,
|
||||||
CGsmooth,
|
CGsmooth,
|
||||||
HPDSolveSloppy,
|
HPDSolveSloppy,
|
||||||
HPDSolve,
|
HPDSolve,
|
||||||
Aggregates);
|
Aggregates);
|
||||||
// result=Zero();
|
result=Zero();
|
||||||
// std::cout << "Calling HDCG single RHS"<<std::endl;
|
*/
|
||||||
|
// std::cout << "Calling HDCG"<<std::endl;
|
||||||
// HDCG(src,result);
|
// HDCG(src,result);
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Build a HDCG mrhs solver
|
// Build a HDCG mrhs solver
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
#if 1
|
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
DoNothingGuesser<CoarseVector> DoNothing;
|
DoNothingGuesser<CoarseVector> DoNothing;
|
||||||
HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
|
HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,coarseCG,DoNothing);
|
||||||
HPDSolver<CoarseVector> HPDSolveMrhsSloppy(MrhsCoarseOp,CGsloppy,DoNothing);
|
|
||||||
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector,Subspace>
|
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector,Subspace>
|
||||||
HDCGmrhs(1.0e-8, 500,
|
HDCGmrhs(1.0e-8, 700,
|
||||||
FineHermOp,
|
FineHermOp,
|
||||||
CGsmooth,
|
CGsmooth,
|
||||||
// HPDSolveSloppy, // Never used
|
HPDSolveSloppy, // Never used
|
||||||
// HPDSolve, // Used in Vstart
|
HPDSolve, // Used in Vstart
|
||||||
HPDSolveMrhsSloppy, // Used in M1
|
HPDSolveMrhs, // Used in M1
|
||||||
HPDSolveMrhs, // Used in Vstart
|
|
||||||
DeflCoarseGuesser, // single RHS guess used in M1
|
DeflCoarseGuesser, // single RHS guess used in M1
|
||||||
CoarseMrhs, // Grid needed to Mrhs grid
|
CoarseMrhs, // Grid needed to Mrhs grid
|
||||||
Aggregates);
|
Aggregates);
|
||||||
@ -826,13 +715,10 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo
|
|||||||
|
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
|
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
|
||||||
std::cout << " mRHS source"<<std::endl;
|
|
||||||
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
|
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
|
||||||
std::cout << " mRHS result"<<std::endl;
|
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
random(RNG5,src_mrhs[0]);
|
|
||||||
for(int r=0;r<nrhs;r++){
|
for(int r=0;r<nrhs;r++){
|
||||||
if(r>0)src_mrhs[r]=src_mrhs[0];
|
random(RNG5,src_mrhs[r]);
|
||||||
res_mrhs[r]=Zero();
|
res_mrhs[r]=Zero();
|
||||||
std::cout << "Setup mrhs source "<<r<<std::endl;
|
std::cout << "Setup mrhs source "<<r<<std::endl;
|
||||||
}
|
}
|
||||||
@ -840,19 +726,14 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo
|
|||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
HDCGmrhs(src_mrhs,res_mrhs);
|
HDCGmrhs(src_mrhs,res_mrhs);
|
||||||
MemoryManager::Print();
|
MemoryManager::Print();
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Standard CG
|
// Standard CG
|
||||||
#if 1
|
|
||||||
{
|
|
||||||
LatticeFermion result(FrbGrid); result=Zero();
|
|
||||||
LatticeFermion src(FrbGrid); random(RNG5,src);
|
|
||||||
result=Zero();
|
result=Zero();
|
||||||
CGfine(HermOpEO, src, result);
|
CGfine(HermOpEO, src, result);
|
||||||
}
|
|
||||||
#endif
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|