mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Much faster coarsening
This commit is contained in:
parent
114db3b99d
commit
76c823781e
@ -1,3 +1,14 @@
|
|||||||
|
// blockZaxpy in bockPromote - 3s, 5%
|
||||||
|
// noncoalesced linalg in Preconditionoer ~ 3s 5%
|
||||||
|
// Lancos tuning or replace 10-20s ~ 25%, open ended
|
||||||
|
// setup tuning 5s ~ 8%
|
||||||
|
// -- e.g. ordermin, orderstep tunables.
|
||||||
|
// MdagM path without norm in LinOp code. few seconds
|
||||||
|
|
||||||
|
// Mdir calc blocking kernels
|
||||||
|
// Fuse kernels in blockMaskedInnerProduct
|
||||||
|
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -34,6 +45,34 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<class vobj,class CComplex>
|
||||||
|
inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner1,
|
||||||
|
Lattice<CComplex> &CoarseInner2,
|
||||||
|
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask1,
|
||||||
|
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask2,
|
||||||
|
const Lattice<vobj> &fineX,
|
||||||
|
const Lattice<vobj> &fineY)
|
||||||
|
{
|
||||||
|
typedef decltype(innerProduct(vobj(),vobj())) dotp;
|
||||||
|
|
||||||
|
GridBase *coarse(CoarseInner1.Grid());
|
||||||
|
GridBase *fine (fineX.Grid());
|
||||||
|
|
||||||
|
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||||
|
Lattice<dotp> fine_inner_msk(fine);
|
||||||
|
|
||||||
|
// Multiply could be fused with innerProduct
|
||||||
|
// Single block sum kernel could do both masks.
|
||||||
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
|
|
||||||
|
mult(fine_inner_msk, fine_inner,FineMask1);
|
||||||
|
blockSum(CoarseInner1,fine_inner_msk);
|
||||||
|
|
||||||
|
mult(fine_inner_msk, fine_inner,FineMask2);
|
||||||
|
blockSum(CoarseInner2,fine_inner_msk);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class Geometry {
|
class Geometry {
|
||||||
public:
|
public:
|
||||||
int npoint;
|
int npoint;
|
||||||
@ -51,10 +90,10 @@ public:
|
|||||||
directions.resize(npoint);
|
directions.resize(npoint);
|
||||||
displacements.resize(npoint);
|
displacements.resize(npoint);
|
||||||
for(int d=0;d<_d;d++){
|
for(int d=0;d<_d;d++){
|
||||||
directions[2*d ] = d+base;
|
directions[d ] = d+base;
|
||||||
directions[2*d+1] = d+base;
|
directions[d+_d] = d+base;
|
||||||
displacements[2*d ] = +1;
|
displacements[d ] = +1;
|
||||||
displacements[2*d+1] = -1;
|
displacements[d+_d]= -1;
|
||||||
}
|
}
|
||||||
directions [2*_d]=0;
|
directions [2*_d]=0;
|
||||||
displacements[2*_d]=0;
|
displacements[2*_d]=0;
|
||||||
@ -136,20 +175,15 @@ public:
|
|||||||
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
||||||
}
|
}
|
||||||
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
|
||||||
blockProject(CoarseVec,FineVec,subspace);
|
blockProject(CoarseVec,FineVec,subspace);
|
||||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
|
||||||
}
|
}
|
||||||
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
|
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
|
||||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
|
||||||
blockPromote(CoarseVec,FineVec,subspace);
|
blockPromote(CoarseVec,FineVec,subspace);
|
||||||
// std::cout << GridLogMessage<< "BlockPromote done"<<std::endl;
|
|
||||||
}
|
}
|
||||||
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
||||||
for(int i=0;i<nbasis;i++){
|
for(int i=0;i<nbasis;i++){
|
||||||
random(RNG,subspace[i]);
|
random(RNG,subspace[i]);
|
||||||
// std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,12 +224,15 @@ public:
|
|||||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||||
// and this is the best I found
|
// and this is the best I found
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#if 1
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
int nn,
|
int nn,
|
||||||
double hi,
|
double hi,
|
||||||
double lo,
|
double lo,
|
||||||
int order,
|
int orderfilter,
|
||||||
int orderstep
|
int ordermin,
|
||||||
|
int orderstep,
|
||||||
|
double filterlo
|
||||||
) {
|
) {
|
||||||
|
|
||||||
RealD scale;
|
RealD scale;
|
||||||
@ -215,7 +252,7 @@ public:
|
|||||||
int b =0;
|
int b =0;
|
||||||
{
|
{
|
||||||
// Filter
|
// Filter
|
||||||
Chebyshev<FineField> Cheb(lo,hi,order);
|
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||||
Cheb(hermop,noise,Mn);
|
Cheb(hermop,noise,Mn);
|
||||||
// normalise
|
// normalise
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
@ -227,7 +264,7 @@ public:
|
|||||||
|
|
||||||
// Generate a full sequence of Chebyshevs
|
// Generate a full sequence of Chebyshevs
|
||||||
{
|
{
|
||||||
lo=0;
|
lo=filterlo;
|
||||||
noise=Mn;
|
noise=Mn;
|
||||||
|
|
||||||
FineField T0(FineGrid); T0 = noise;
|
FineField T0(FineGrid); T0 = noise;
|
||||||
@ -245,7 +282,7 @@ public:
|
|||||||
hermop.HermOp(T0,y);
|
hermop.HermOp(T0,y);
|
||||||
T1=y*xscale+noise*mscale;
|
T1=y*xscale+noise*mscale;
|
||||||
|
|
||||||
for(int n=2;n<=orderstep*(nn-1);n++){
|
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
|
||||||
|
|
||||||
hermop.HermOp(*Tn,y);
|
hermop.HermOp(*Tn,y);
|
||||||
|
|
||||||
@ -253,6 +290,7 @@ public:
|
|||||||
auto Tn_v = Tn->View();
|
auto Tn_v = Tn->View();
|
||||||
auto Tnp_v = Tnp->View();
|
auto Tnp_v = Tnp->View();
|
||||||
auto Tnm_v = Tnm->View();
|
auto Tnm_v = Tnm->View();
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
|
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
|
||||||
@ -260,12 +298,14 @@ public:
|
|||||||
|
|
||||||
// Possible more fine grained control is needed than a linear sweep,
|
// Possible more fine grained control is needed than a linear sweep,
|
||||||
// but huge productivity gain if this is simple algorithm and not a tunable
|
// but huge productivity gain if this is simple algorithm and not a tunable
|
||||||
if ( (n%orderstep)==0 ) {
|
int m =1;
|
||||||
|
if ( n>=ordermin ) m=n-ordermin;
|
||||||
|
if ( (m%orderstep)==0 ) {
|
||||||
Mn=*Tnp;
|
Mn=*Tnp;
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
subspace[b] = Mn;
|
subspace[b] = Mn;
|
||||||
hermop.Op(Mn,tmp);
|
hermop.Op(Mn,tmp);
|
||||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -279,6 +319,202 @@ public:
|
|||||||
}
|
}
|
||||||
assert(b==nn);
|
assert(b==nn);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if 0
|
||||||
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
|
int nn,
|
||||||
|
double hi,
|
||||||
|
double lo,
|
||||||
|
int orderfilter,
|
||||||
|
int ordermin,
|
||||||
|
int orderstep,
|
||||||
|
double filterlo
|
||||||
|
) {
|
||||||
|
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
FineField combined(FineGrid);
|
||||||
|
|
||||||
|
// New normalised noise
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
// Initial matrix element
|
||||||
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
int b =0;
|
||||||
|
#define FILTERb(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
Cheb(hermop,noise,Mn); \
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||||
|
subspace[b] = Mn; \
|
||||||
|
hermop.Op(Mn,tmp); \
|
||||||
|
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
||||||
|
|
||||||
|
RealD alpha=-0.8;
|
||||||
|
RealD beta =-0.8;
|
||||||
|
#define FILTER(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
||||||
|
Cheb(hermop,noise,Mn); \
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||||
|
subspace[b] = Mn; \
|
||||||
|
hermop.Op(Mn,tmp); \
|
||||||
|
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FILTERc(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
Cheb(hermop,noise,combined); \
|
||||||
|
}
|
||||||
|
|
||||||
|
double node = 0.000;
|
||||||
|
FILTERb(lo,hi,orderfilter);// 0
|
||||||
|
// FILTERc(node,hi,51);// 0
|
||||||
|
noise = Mn;
|
||||||
|
int base = 0;
|
||||||
|
int mult = 100;
|
||||||
|
FILTER(node,hi,base+1*mult);
|
||||||
|
FILTER(node,hi,base+2*mult);
|
||||||
|
FILTER(node,hi,base+3*mult);
|
||||||
|
FILTER(node,hi,base+4*mult);
|
||||||
|
FILTER(node,hi,base+5*mult);
|
||||||
|
FILTER(node,hi,base+6*mult);
|
||||||
|
FILTER(node,hi,base+7*mult);
|
||||||
|
FILTER(node,hi,base+8*mult);
|
||||||
|
FILTER(node,hi,base+9*mult);
|
||||||
|
FILTER(node,hi,base+10*mult);
|
||||||
|
FILTER(node,hi,base+11*mult);
|
||||||
|
FILTER(node,hi,base+12*mult);
|
||||||
|
FILTER(node,hi,base+13*mult);
|
||||||
|
FILTER(node,hi,base+14*mult);
|
||||||
|
FILTER(node,hi,base+15*mult);
|
||||||
|
assert(b==nn);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
|
int nn,
|
||||||
|
double hi,
|
||||||
|
double lo,
|
||||||
|
int orderfilter,
|
||||||
|
int ordermin,
|
||||||
|
int orderstep,
|
||||||
|
double filterlo
|
||||||
|
) {
|
||||||
|
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
FineField combined(FineGrid);
|
||||||
|
|
||||||
|
// New normalised noise
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
// Initial matrix element
|
||||||
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
int b =0;
|
||||||
|
{
|
||||||
|
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
||||||
|
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
||||||
|
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
||||||
|
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
||||||
|
JacobiPoly(hermop,noise,Mn);
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
|
subspace[b] = Mn;
|
||||||
|
hermop.Op(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||||
|
b++;
|
||||||
|
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||||
|
// subspace[b] = tmp; b++;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FILTER(lambda) \
|
||||||
|
{ \
|
||||||
|
hermop.HermOp(subspace[0],tmp); \
|
||||||
|
tmp = tmp - lambda *subspace[0]; \
|
||||||
|
scale = std::pow(norm2(tmp),-0.5); \
|
||||||
|
tmp=tmp*scale; \
|
||||||
|
subspace[b] = tmp; \
|
||||||
|
hermop.Op(subspace[b],tmp); \
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||||
|
// subspace[b] = tmp; b++;
|
||||||
|
// }
|
||||||
|
|
||||||
|
FILTER(2.0e-5);
|
||||||
|
FILTER(2.0e-4);
|
||||||
|
FILTER(4.0e-4);
|
||||||
|
FILTER(8.0e-4);
|
||||||
|
FILTER(8.0e-4);
|
||||||
|
|
||||||
|
FILTER(2.0e-3);
|
||||||
|
FILTER(3.0e-3);
|
||||||
|
FILTER(4.0e-3);
|
||||||
|
FILTER(5.0e-3);
|
||||||
|
FILTER(6.0e-3);
|
||||||
|
|
||||||
|
FILTER(2.5e-3);
|
||||||
|
FILTER(3.5e-3);
|
||||||
|
FILTER(4.5e-3);
|
||||||
|
FILTER(5.5e-3);
|
||||||
|
FILTER(6.5e-3);
|
||||||
|
|
||||||
|
// FILTER(6.0e-5);//6
|
||||||
|
// FILTER(7.0e-5);//8
|
||||||
|
// FILTER(8.0e-5);//9
|
||||||
|
// FILTER(9.0e-5);//3
|
||||||
|
|
||||||
|
/*
|
||||||
|
// FILTER(1.0e-4);//10
|
||||||
|
FILTER(2.0e-4);//11
|
||||||
|
// FILTER(3.0e-4);//12
|
||||||
|
// FILTER(4.0e-4);//13
|
||||||
|
FILTER(5.0e-4);//14
|
||||||
|
|
||||||
|
FILTER(6.0e-3);//4
|
||||||
|
FILTER(7.0e-4);//1
|
||||||
|
FILTER(8.0e-4);//7
|
||||||
|
FILTER(9.0e-4);//15
|
||||||
|
FILTER(1.0e-3);//2
|
||||||
|
|
||||||
|
FILTER(2.0e-3);//2
|
||||||
|
FILTER(3.0e-3);//2
|
||||||
|
FILTER(4.0e-3);//2
|
||||||
|
FILTER(5.0e-3);//2
|
||||||
|
FILTER(6.0e-3);//2
|
||||||
|
|
||||||
|
FILTER(7.0e-3);//2
|
||||||
|
FILTER(8.0e-3);//2
|
||||||
|
FILTER(1.0e-2);//2
|
||||||
|
*/
|
||||||
|
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
||||||
|
assert(b==nn);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Fine Object == (per site) type of fine field
|
// Fine Object == (per site) type of fine field
|
||||||
@ -287,7 +523,8 @@ template<class Fobj,class CComplex,int nbasis>
|
|||||||
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef iVector<CComplex,nbasis > siteVector;
|
typedef iVector<CComplex,nbasis > siteVector;
|
||||||
|
typedef Lattice<CComplex > CoarseComplexField;
|
||||||
typedef Lattice<siteVector> CoarseVector;
|
typedef Lattice<siteVector> CoarseVector;
|
||||||
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
||||||
typedef iMatrix<CComplex,nbasis > Cobj;
|
typedef iMatrix<CComplex,nbasis > Cobj;
|
||||||
@ -304,7 +541,6 @@ public:
|
|||||||
CartesianStencil<siteVector,siteVector,int> Stencil;
|
CartesianStencil<siteVector,siteVector,int> Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -316,7 +552,6 @@ public:
|
|||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
|
|
||||||
|
|
||||||
// RealD Nin = norm2(in);
|
// RealD Nin = norm2(in);
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
@ -333,16 +568,14 @@ public:
|
|||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
GridStopWatch ArithmeticTimer;
|
GridStopWatch ArithmeticTimer;
|
||||||
int osites=Grid()->oSites();
|
int osites=Grid()->oSites();
|
||||||
double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
|
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
|
||||||
double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
|
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
|
||||||
double usecs =-usecond();
|
double usecs =-usecond();
|
||||||
|
|
||||||
// assert(geom.npoint==9);
|
// assert(geom.npoint==9);
|
||||||
|
|
||||||
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
@ -418,7 +651,37 @@ public:
|
|||||||
|
|
||||||
auto out_v = out.View();
|
auto out_v = out.View();
|
||||||
auto in_v = in.View();
|
auto in_v = in.View();
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
int lane=SIMTlane(Nsimd);
|
||||||
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||||
|
}
|
||||||
|
synchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res,lane);
|
||||||
|
});
|
||||||
|
#if 0
|
||||||
accelerator_for(ss,Grid()->oSites(),1,{
|
accelerator_for(ss,Grid()->oSites(),1,{
|
||||||
|
|
||||||
siteVector res = Zero();
|
siteVector res = Zero();
|
||||||
siteVector nbr;
|
siteVector nbr;
|
||||||
int ptype;
|
int ptype;
|
||||||
@ -433,18 +696,23 @@ public:
|
|||||||
} else {
|
} else {
|
||||||
nbr = Stencil.CommBuf()[SE->_offset];
|
nbr = Stencil.CommBuf()[SE->_offset];
|
||||||
}
|
}
|
||||||
|
synchronise();
|
||||||
|
|
||||||
res = res + Aview_p[point][ss]*nbr;
|
res = res + Aview_p[point][ss]*nbr;
|
||||||
|
|
||||||
out_v[ss]=res;
|
out_v[ss]=res;
|
||||||
});
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
this->MdirComms(in);
|
this->MdirComms(in);
|
||||||
int ndir=geom.npoint-1;
|
int ndir=geom.npoint-1;
|
||||||
assert(out.size()==ndir);
|
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
|
||||||
|
std::cout <<"MdirAll out size "<< out.size()<<std::endl;
|
||||||
|
std::cout <<"MdirAll ndir "<< ndir<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
for(int p=0;p<ndir;p++){
|
for(int p=0;p<ndir;p++){
|
||||||
MdirCalc(in,out[p],p);
|
MdirCalc(in,out[p],p);
|
||||||
}
|
}
|
||||||
@ -496,28 +764,45 @@ public:
|
|||||||
geom(CoarseGrid._ndimension),
|
geom(CoarseGrid._ndimension),
|
||||||
hermitian(hermitian_),
|
hermitian(hermitian_),
|
||||||
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
A(geom.npoint,&CoarseGrid)
|
A(geom.npoint,&CoarseGrid)
|
||||||
{
|
{
|
||||||
};
|
};
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
Aggregation<Fobj,CComplex,nbasis> & Subspace){
|
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||||
|
{
|
||||||
|
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
||||||
|
typedef typename Fobj::scalar_type scalar_type;
|
||||||
|
|
||||||
FineField iblock(FineGrid); // contributions from within this block
|
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
||||||
FineField oblock(FineGrid); // contributions from outwith this block
|
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
||||||
|
|
||||||
|
std::vector<FineComplexField> masks(geom.npoint,FineGrid);
|
||||||
|
FineComplexField imask(FineGrid); // contributions from within this block
|
||||||
|
FineComplexField omask(FineGrid); // contributions from outwith this block
|
||||||
|
|
||||||
|
FineComplexField evenmask(FineGrid);
|
||||||
|
FineComplexField oddmask(FineGrid);
|
||||||
|
|
||||||
FineField phi(FineGrid);
|
FineField phi(FineGrid);
|
||||||
FineField tmp(FineGrid);
|
FineField tmp(FineGrid);
|
||||||
FineField zz(FineGrid); zz=Zero();
|
FineField zz(FineGrid); zz=Zero();
|
||||||
FineField Mphi(FineGrid);
|
FineField Mphi(FineGrid);
|
||||||
|
FineField Mphie(FineGrid);
|
||||||
|
FineField Mphio(FineGrid);
|
||||||
std::vector<FineField> Mphi_p(geom.npoint,FineGrid);
|
std::vector<FineField> Mphi_p(geom.npoint,FineGrid);
|
||||||
|
|
||||||
Lattice<iScalar<vInteger> > coor(FineGrid);
|
Lattice<iScalar<vInteger> > coor (FineGrid);
|
||||||
|
Lattice<iScalar<vInteger> > bcoor(FineGrid);
|
||||||
|
Lattice<iScalar<vInteger> > bcb (FineGrid);
|
||||||
|
|
||||||
CoarseVector iProj(Grid());
|
CoarseVector iProj(Grid());
|
||||||
CoarseVector oProj(Grid());
|
CoarseVector oProj(Grid());
|
||||||
CoarseScalar InnerProd(Grid());
|
CoarseVector SelfProj(Grid());
|
||||||
|
CoarseComplexField iZProj(Grid());
|
||||||
|
CoarseComplexField oZProj(Grid());
|
||||||
|
|
||||||
|
CoarseScalar InnerProd(Grid());
|
||||||
|
|
||||||
// Orthogonalise the subblocks over the basis
|
// Orthogonalise the subblocks over the basis
|
||||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||||
@ -525,22 +810,46 @@ public:
|
|||||||
// Compute the matrix elements of linop between this orthonormal
|
// Compute the matrix elements of linop between this orthonormal
|
||||||
// set of vectors.
|
// set of vectors.
|
||||||
int self_stencil=-1;
|
int self_stencil=-1;
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++)
|
||||||
|
{
|
||||||
|
int dir = geom.directions[p];
|
||||||
|
int disp = geom.displacements[p];
|
||||||
A[p]=Zero();
|
A[p]=Zero();
|
||||||
if( geom.displacements[p]==0){
|
if( geom.displacements[p]==0){
|
||||||
self_stencil=p;
|
self_stencil=p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
|
||||||
|
|
||||||
|
LatticeCoordinate(coor,dir);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Work out even and odd block checkerboarding for fast diagonal term
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
if ( disp==1 ) {
|
||||||
|
bcb = bcb + div(coor,block);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( disp==0 ) {
|
||||||
|
masks[p]= Zero();
|
||||||
|
} else if ( disp==1 ) {
|
||||||
|
masks[p] = where(mod(coor,block)==(block-1),one,zero);
|
||||||
|
} else if ( disp==-1 ) {
|
||||||
|
masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
|
||||||
|
oddmask = one-evenmask;
|
||||||
|
|
||||||
assert(self_stencil!=-1);
|
assert(self_stencil!=-1);
|
||||||
|
|
||||||
for(int i=0;i<nbasis;i++){
|
for(int i=0;i<nbasis;i++){
|
||||||
|
|
||||||
phi=Subspace.subspace[i];
|
phi=Subspace.subspace[i];
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir " << std::endl;
|
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
|
||||||
linop.OpDirAll(phi,Mphi_p);
|
linop.OpDirAll(phi,Mphi_p);
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir calculated" << std::endl;
|
|
||||||
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
|
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDiag calculated" << std::endl;
|
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
|
|
||||||
@ -549,54 +858,66 @@ public:
|
|||||||
int dir = geom.directions[p];
|
int dir = geom.directions[p];
|
||||||
int disp = geom.displacements[p];
|
int disp = geom.displacements[p];
|
||||||
|
|
||||||
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
|
if ( (disp==-1) || (!hermitian ) ) {
|
||||||
|
|
||||||
LatticeCoordinate(coor,dir);
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// Pick out contributions coming from this cell and neighbour cell
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
omask = masks[p];
|
||||||
|
imask = one-omask;
|
||||||
|
|
||||||
|
for(int j=0;j<nbasis;j++){
|
||||||
|
|
||||||
|
blockMaskedInnerProduct(iZProj,oZProj,imask,omask,Subspace.subspace[j],Mphi);
|
||||||
|
|
||||||
|
auto iZProj_v = iZProj.View() ;
|
||||||
|
auto oZProj_v = oZProj.View() ;
|
||||||
|
auto A_p = A[p].View();
|
||||||
|
auto A_self = A[self_stencil].View();
|
||||||
|
|
||||||
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
}
|
||||||
// Pick out contributions coming from this cell and neighbour cell
|
}
|
||||||
////////////////////////////////////////////////////////////////////////
|
}
|
||||||
if ( disp==0 ) {
|
|
||||||
iblock = Mphi;
|
///////////////////////////////////////////
|
||||||
oblock = Zero();
|
// Faster alternate self coupling.. use hermiticity to save 2x
|
||||||
} else if ( disp==1 ) {
|
///////////////////////////////////////////
|
||||||
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
|
{
|
||||||
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
|
mult(tmp,phi,evenmask); linop.Op(tmp,Mphie);
|
||||||
} else if ( disp==-1 ) {
|
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||||
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
|
|
||||||
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
|
// tmp = Mphie*evenmask + Mphio*oddmask;
|
||||||
} else {
|
{
|
||||||
assert(0);
|
auto tmp_ = tmp.View();
|
||||||
|
auto evenmask_ = evenmask.View();
|
||||||
|
auto oddmask_ = oddmask.View();
|
||||||
|
auto Mphie_ = Mphie.View();
|
||||||
|
auto Mphio_ = Mphio.View();
|
||||||
|
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||||
|
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Could do local inner products,
|
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||||
// and then block pick the IP's.
|
|
||||||
// Ideally write a routine to do two masked block sums at once
|
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix picked "<<p<< std::endl;
|
|
||||||
Subspace.ProjectToSubspace(iProj,iblock);
|
|
||||||
Subspace.ProjectToSubspace(oProj,oblock);
|
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix projected"<<p<< std::endl;
|
|
||||||
|
|
||||||
// 4x gain possible in this loop. Profile and identify time loss.
|
auto SelfProj_ = SelfProj.View();
|
||||||
// i) Assume Hermiticity, upper diagonal only (2x)
|
|
||||||
// ii) Local inner product, then pick the local inners and sum. (2x)
|
|
||||||
|
|
||||||
auto iProj_v = iProj.View() ;
|
|
||||||
auto oProj_v = oProj.View() ;
|
|
||||||
auto A_p = A[p].View();
|
|
||||||
auto A_self = A[self_stencil].View();
|
auto A_self = A[self_stencil].View();
|
||||||
accelerator_for(ss, Grid()->oSites(),1,{
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||||
for(int j=0;j<nbasis;j++){
|
for(int j=0;j<nbasis;j++){
|
||||||
if( disp!= 0 ) {
|
coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
|
||||||
A_p[ss](j,i) = oProj_v[ss](j);
|
|
||||||
}
|
|
||||||
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(hermitian) {
|
||||||
|
std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
|
||||||
|
ForceHermitian();
|
||||||
|
}
|
||||||
|
// AssertHermitian();
|
||||||
|
// ForceDiagonal();
|
||||||
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
///////////////////////////
|
///////////////////////////
|
||||||
@ -619,25 +940,26 @@ public:
|
|||||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
std::cout<<GridLogMessage<< iProj <<std::endl;
|
||||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
||||||
#endif
|
#endif
|
||||||
/*
|
|
||||||
if(hermitian) {
|
|
||||||
std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
|
|
||||||
ForceHermitian();
|
|
||||||
}
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
std::cout << GridLogMessage<< " dir "<< norm2(A[p]) <<std::endl;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
// AssertHermitian();
|
|
||||||
// ForceDiagonal();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ForceHermitian(void) {
|
void ForceHermitian(void) {
|
||||||
for(int d=0;d<4;d++){
|
CoarseMatrix Diff (Grid());
|
||||||
int dd=d+1;
|
for(int p=0;p<geom.npoint;p++){
|
||||||
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
|
int dir = geom.directions[p];
|
||||||
|
int disp = geom.displacements[p];
|
||||||
|
if(disp==-1) {
|
||||||
|
// Find the opposite link
|
||||||
|
for(int pp=0;pp<geom.npoint;pp++){
|
||||||
|
int dirp = geom.directions[pp];
|
||||||
|
int dispp = geom.displacements[pp];
|
||||||
|
if ( (dirp==dir) && (dispp==1) ){
|
||||||
|
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
|
||||||
|
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
|
||||||
|
A[pp] = adj(Cshift(A[p],dir,1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// A[8] = 0.5*(A[8] + adj(A[8]));
|
|
||||||
}
|
}
|
||||||
void AssertHermitian(void) {
|
void AssertHermitian(void) {
|
||||||
CoarseMatrix AA (Grid());
|
CoarseMatrix AA (Grid());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user