mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-10 14:10:46 +01:00
Much faster coarsening
This commit is contained in:
parent
114db3b99d
commit
76c823781e
@ -1,3 +1,14 @@
|
||||
// blockZaxpy in bockPromote - 3s, 5%
|
||||
// noncoalesced linalg in Preconditionoer ~ 3s 5%
|
||||
// Lancos tuning or replace 10-20s ~ 25%, open ended
|
||||
// setup tuning 5s ~ 8%
|
||||
// -- e.g. ordermin, orderstep tunables.
|
||||
// MdagM path without norm in LinOp code. few seconds
|
||||
|
||||
// Mdir calc blocking kernels
|
||||
// Fuse kernels in blockMaskedInnerProduct
|
||||
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -34,6 +45,34 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class vobj,class CComplex>
|
||||
inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner1,
|
||||
Lattice<CComplex> &CoarseInner2,
|
||||
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask1,
|
||||
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask2,
|
||||
const Lattice<vobj> &fineX,
|
||||
const Lattice<vobj> &fineY)
|
||||
{
|
||||
typedef decltype(innerProduct(vobj(),vobj())) dotp;
|
||||
|
||||
GridBase *coarse(CoarseInner1.Grid());
|
||||
GridBase *fine (fineX.Grid());
|
||||
|
||||
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||
Lattice<dotp> fine_inner_msk(fine);
|
||||
|
||||
// Multiply could be fused with innerProduct
|
||||
// Single block sum kernel could do both masks.
|
||||
fine_inner = localInnerProduct(fineX,fineY);
|
||||
|
||||
mult(fine_inner_msk, fine_inner,FineMask1);
|
||||
blockSum(CoarseInner1,fine_inner_msk);
|
||||
|
||||
mult(fine_inner_msk, fine_inner,FineMask2);
|
||||
blockSum(CoarseInner2,fine_inner_msk);
|
||||
}
|
||||
|
||||
|
||||
class Geometry {
|
||||
public:
|
||||
int npoint;
|
||||
@ -51,10 +90,10 @@ public:
|
||||
directions.resize(npoint);
|
||||
displacements.resize(npoint);
|
||||
for(int d=0;d<_d;d++){
|
||||
directions[2*d ] = d+base;
|
||||
directions[2*d+1] = d+base;
|
||||
displacements[2*d ] = +1;
|
||||
displacements[2*d+1] = -1;
|
||||
directions[d ] = d+base;
|
||||
directions[d+_d] = d+base;
|
||||
displacements[d ] = +1;
|
||||
displacements[d+_d]= -1;
|
||||
}
|
||||
directions [2*_d]=0;
|
||||
displacements[2*_d]=0;
|
||||
@ -136,20 +175,15 @@ public:
|
||||
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
||||
}
|
||||
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
||||
blockProject(CoarseVec,FineVec,subspace);
|
||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
||||
}
|
||||
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
|
||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||
// std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
|
||||
blockPromote(CoarseVec,FineVec,subspace);
|
||||
// std::cout << GridLogMessage<< "BlockPromote done"<<std::endl;
|
||||
}
|
||||
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
||||
for(int i=0;i<nbasis;i++){
|
||||
random(RNG,subspace[i]);
|
||||
// std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@ -190,12 +224,15 @@ public:
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#if 1
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int order,
|
||||
int orderstep
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
@ -215,7 +252,7 @@ public:
|
||||
int b =0;
|
||||
{
|
||||
// Filter
|
||||
Chebyshev<FineField> Cheb(lo,hi,order);
|
||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
@ -227,7 +264,7 @@ public:
|
||||
|
||||
// Generate a full sequence of Chebyshevs
|
||||
{
|
||||
lo=0;
|
||||
lo=filterlo;
|
||||
noise=Mn;
|
||||
|
||||
FineField T0(FineGrid); T0 = noise;
|
||||
@ -245,7 +282,7 @@ public:
|
||||
hermop.HermOp(T0,y);
|
||||
T1=y*xscale+noise*mscale;
|
||||
|
||||
for(int n=2;n<=orderstep*(nn-1);n++){
|
||||
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
|
||||
|
||||
hermop.HermOp(*Tn,y);
|
||||
|
||||
@ -253,6 +290,7 @@ public:
|
||||
auto Tn_v = Tn->View();
|
||||
auto Tnp_v = Tnp->View();
|
||||
auto Tnm_v = Tnm->View();
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
|
||||
@ -260,12 +298,14 @@ public:
|
||||
|
||||
// Possible more fine grained control is needed than a linear sweep,
|
||||
// but huge productivity gain if this is simple algorithm and not a tunable
|
||||
if ( (n%orderstep)==0 ) {
|
||||
int m =1;
|
||||
if ( n>=ordermin ) m=n-ordermin;
|
||||
if ( (m%orderstep)==0 ) {
|
||||
Mn=*Tnp;
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
b++;
|
||||
}
|
||||
|
||||
@ -279,6 +319,202 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
#define FILTERb(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
||||
|
||||
RealD alpha=-0.8;
|
||||
RealD beta =-0.8;
|
||||
#define FILTER(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
#define FILTERc(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,combined); \
|
||||
}
|
||||
|
||||
double node = 0.000;
|
||||
FILTERb(lo,hi,orderfilter);// 0
|
||||
// FILTERc(node,hi,51);// 0
|
||||
noise = Mn;
|
||||
int base = 0;
|
||||
int mult = 100;
|
||||
FILTER(node,hi,base+1*mult);
|
||||
FILTER(node,hi,base+2*mult);
|
||||
FILTER(node,hi,base+3*mult);
|
||||
FILTER(node,hi,base+4*mult);
|
||||
FILTER(node,hi,base+5*mult);
|
||||
FILTER(node,hi,base+6*mult);
|
||||
FILTER(node,hi,base+7*mult);
|
||||
FILTER(node,hi,base+8*mult);
|
||||
FILTER(node,hi,base+9*mult);
|
||||
FILTER(node,hi,base+10*mult);
|
||||
FILTER(node,hi,base+11*mult);
|
||||
FILTER(node,hi,base+12*mult);
|
||||
FILTER(node,hi,base+13*mult);
|
||||
FILTER(node,hi,base+14*mult);
|
||||
FILTER(node,hi,base+15*mult);
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
{
|
||||
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
||||
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
||||
JacobiPoly(hermop,noise,Mn);
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
b++;
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
}
|
||||
|
||||
#define FILTER(lambda) \
|
||||
{ \
|
||||
hermop.HermOp(subspace[0],tmp); \
|
||||
tmp = tmp - lambda *subspace[0]; \
|
||||
scale = std::pow(norm2(tmp),-0.5); \
|
||||
tmp=tmp*scale; \
|
||||
subspace[b] = tmp; \
|
||||
hermop.Op(subspace[b],tmp); \
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
|
||||
FILTER(2.0e-5);
|
||||
FILTER(2.0e-4);
|
||||
FILTER(4.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
|
||||
FILTER(2.0e-3);
|
||||
FILTER(3.0e-3);
|
||||
FILTER(4.0e-3);
|
||||
FILTER(5.0e-3);
|
||||
FILTER(6.0e-3);
|
||||
|
||||
FILTER(2.5e-3);
|
||||
FILTER(3.5e-3);
|
||||
FILTER(4.5e-3);
|
||||
FILTER(5.5e-3);
|
||||
FILTER(6.5e-3);
|
||||
|
||||
// FILTER(6.0e-5);//6
|
||||
// FILTER(7.0e-5);//8
|
||||
// FILTER(8.0e-5);//9
|
||||
// FILTER(9.0e-5);//3
|
||||
|
||||
/*
|
||||
// FILTER(1.0e-4);//10
|
||||
FILTER(2.0e-4);//11
|
||||
// FILTER(3.0e-4);//12
|
||||
// FILTER(4.0e-4);//13
|
||||
FILTER(5.0e-4);//14
|
||||
|
||||
FILTER(6.0e-3);//4
|
||||
FILTER(7.0e-4);//1
|
||||
FILTER(8.0e-4);//7
|
||||
FILTER(9.0e-4);//15
|
||||
FILTER(1.0e-3);//2
|
||||
|
||||
FILTER(2.0e-3);//2
|
||||
FILTER(3.0e-3);//2
|
||||
FILTER(4.0e-3);//2
|
||||
FILTER(5.0e-3);//2
|
||||
FILTER(6.0e-3);//2
|
||||
|
||||
FILTER(7.0e-3);//2
|
||||
FILTER(8.0e-3);//2
|
||||
FILTER(1.0e-2);//2
|
||||
*/
|
||||
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
};
|
||||
|
||||
// Fine Object == (per site) type of fine field
|
||||
@ -288,6 +524,7 @@ class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis
|
||||
public:
|
||||
|
||||
typedef iVector<CComplex,nbasis > siteVector;
|
||||
typedef Lattice<CComplex > CoarseComplexField;
|
||||
typedef Lattice<siteVector> CoarseVector;
|
||||
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
|
||||
typedef iMatrix<CComplex,nbasis > Cobj;
|
||||
@ -305,7 +542,6 @@ public:
|
||||
|
||||
std::vector<CoarseMatrix> A;
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Interface
|
||||
///////////////////////
|
||||
@ -316,7 +552,6 @@ public:
|
||||
conformable(_grid,in.Grid());
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
|
||||
// RealD Nin = norm2(in);
|
||||
SimpleCompressor<siteVector> compressor;
|
||||
|
||||
@ -333,16 +568,14 @@ public:
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
|
||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||
|
||||
GridStopWatch ArithmeticTimer;
|
||||
int osites=Grid()->oSites();
|
||||
double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
|
||||
double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
|
||||
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
|
||||
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
|
||||
double usecs =-usecond();
|
||||
|
||||
// assert(geom.npoint==9);
|
||||
|
||||
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||
@ -418,7 +651,37 @@ public:
|
||||
|
||||
auto out_v = out.View();
|
||||
auto in_v = in.View();
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||
|
||||
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||
int ss = sss/nbasis;
|
||||
int b = sss%nbasis;
|
||||
calcComplex res = Zero();
|
||||
calcVector nbr;
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
int lane=SIMTlane(Nsimd);
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local) {
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||
} else {
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||
}
|
||||
synchronise();
|
||||
|
||||
for(int bb=0;bb<nbasis;bb++) {
|
||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res,lane);
|
||||
});
|
||||
#if 0
|
||||
accelerator_for(ss,Grid()->oSites(),1,{
|
||||
|
||||
siteVector res = Zero();
|
||||
siteVector nbr;
|
||||
int ptype;
|
||||
@ -433,18 +696,23 @@ public:
|
||||
} else {
|
||||
nbr = Stencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
synchronise();
|
||||
|
||||
res = res + Aview_p[point][ss]*nbr;
|
||||
|
||||
out_v[ss]=res;
|
||||
});
|
||||
|
||||
#endif
|
||||
}
|
||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||
{
|
||||
this->MdirComms(in);
|
||||
int ndir=geom.npoint-1;
|
||||
assert(out.size()==ndir);
|
||||
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
|
||||
std::cout <<"MdirAll out size "<< out.size()<<std::endl;
|
||||
std::cout <<"MdirAll ndir "<< ndir<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
for(int p=0;p<ndir;p++){
|
||||
MdirCalc(in,out[p],p);
|
||||
}
|
||||
@ -501,23 +769,40 @@ public:
|
||||
};
|
||||
|
||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & Subspace){
|
||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||
{
|
||||
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
||||
typedef typename Fobj::scalar_type scalar_type;
|
||||
|
||||
FineField iblock(FineGrid); // contributions from within this block
|
||||
FineField oblock(FineGrid); // contributions from outwith this block
|
||||
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
||||
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
||||
|
||||
std::vector<FineComplexField> masks(geom.npoint,FineGrid);
|
||||
FineComplexField imask(FineGrid); // contributions from within this block
|
||||
FineComplexField omask(FineGrid); // contributions from outwith this block
|
||||
|
||||
FineComplexField evenmask(FineGrid);
|
||||
FineComplexField oddmask(FineGrid);
|
||||
|
||||
FineField phi(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField zz(FineGrid); zz=Zero();
|
||||
FineField Mphi(FineGrid);
|
||||
FineField Mphie(FineGrid);
|
||||
FineField Mphio(FineGrid);
|
||||
std::vector<FineField> Mphi_p(geom.npoint,FineGrid);
|
||||
|
||||
Lattice<iScalar<vInteger> > coor (FineGrid);
|
||||
Lattice<iScalar<vInteger> > bcoor(FineGrid);
|
||||
Lattice<iScalar<vInteger> > bcb (FineGrid);
|
||||
|
||||
CoarseVector iProj(Grid());
|
||||
CoarseVector oProj(Grid());
|
||||
CoarseScalar InnerProd(Grid());
|
||||
CoarseVector SelfProj(Grid());
|
||||
CoarseComplexField iZProj(Grid());
|
||||
CoarseComplexField oZProj(Grid());
|
||||
|
||||
CoarseScalar InnerProd(Grid());
|
||||
|
||||
// Orthogonalise the subblocks over the basis
|
||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||
@ -525,22 +810,46 @@ public:
|
||||
// Compute the matrix elements of linop between this orthonormal
|
||||
// set of vectors.
|
||||
int self_stencil=-1;
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
for(int p=0;p<geom.npoint;p++)
|
||||
{
|
||||
int dir = geom.directions[p];
|
||||
int disp = geom.displacements[p];
|
||||
A[p]=Zero();
|
||||
if( geom.displacements[p]==0){
|
||||
self_stencil=p;
|
||||
}
|
||||
|
||||
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
|
||||
|
||||
LatticeCoordinate(coor,dir);
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
// Work out even and odd block checkerboarding for fast diagonal term
|
||||
///////////////////////////////////////////////////////
|
||||
if ( disp==1 ) {
|
||||
bcb = bcb + div(coor,block);
|
||||
}
|
||||
|
||||
if ( disp==0 ) {
|
||||
masks[p]= Zero();
|
||||
} else if ( disp==1 ) {
|
||||
masks[p] = where(mod(coor,block)==(block-1),one,zero);
|
||||
} else if ( disp==-1 ) {
|
||||
masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
|
||||
}
|
||||
}
|
||||
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
|
||||
oddmask = one-evenmask;
|
||||
|
||||
assert(self_stencil!=-1);
|
||||
|
||||
for(int i=0;i<nbasis;i++){
|
||||
|
||||
phi=Subspace.subspace[i];
|
||||
|
||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir " << std::endl;
|
||||
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
|
||||
linop.OpDirAll(phi,Mphi_p);
|
||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir calculated" << std::endl;
|
||||
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
|
||||
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDiag calculated" << std::endl;
|
||||
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
|
||||
@ -549,54 +858,66 @@ public:
|
||||
int dir = geom.directions[p];
|
||||
int disp = geom.displacements[p];
|
||||
|
||||
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
|
||||
|
||||
LatticeCoordinate(coor,dir);
|
||||
|
||||
if ( (disp==-1) || (!hermitian ) ) {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Pick out contributions coming from this cell and neighbour cell
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
if ( disp==0 ) {
|
||||
iblock = Mphi;
|
||||
oblock = Zero();
|
||||
} else if ( disp==1 ) {
|
||||
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
|
||||
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
|
||||
} else if ( disp==-1 ) {
|
||||
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
|
||||
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
omask = masks[p];
|
||||
imask = one-omask;
|
||||
|
||||
// Could do local inner products,
|
||||
// and then block pick the IP's.
|
||||
// Ideally write a routine to do two masked block sums at once
|
||||
std::cout << GridLogMessage<< "CoarsenMatrix picked "<<p<< std::endl;
|
||||
Subspace.ProjectToSubspace(iProj,iblock);
|
||||
Subspace.ProjectToSubspace(oProj,oblock);
|
||||
std::cout << GridLogMessage<< "CoarsenMatrix projected"<<p<< std::endl;
|
||||
for(int j=0;j<nbasis;j++){
|
||||
|
||||
// 4x gain possible in this loop. Profile and identify time loss.
|
||||
// i) Assume Hermiticity, upper diagonal only (2x)
|
||||
// ii) Local inner product, then pick the local inners and sum. (2x)
|
||||
blockMaskedInnerProduct(iZProj,oZProj,imask,omask,Subspace.subspace[j],Mphi);
|
||||
|
||||
auto iProj_v = iProj.View() ;
|
||||
auto oProj_v = oProj.View() ;
|
||||
auto iZProj_v = iZProj.View() ;
|
||||
auto oZProj_v = oZProj.View() ;
|
||||
auto A_p = A[p].View();
|
||||
auto A_self = A[self_stencil].View();
|
||||
accelerator_for(ss, Grid()->oSites(),1,{
|
||||
for(int j=0;j<nbasis;j++){
|
||||
if( disp!= 0 ) {
|
||||
A_p[ss](j,i) = oProj_v[ss](j);
|
||||
}
|
||||
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
|
||||
}
|
||||
});
|
||||
|
||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Faster alternate self coupling.. use hermiticity to save 2x
|
||||
///////////////////////////////////////////
|
||||
{
|
||||
mult(tmp,phi,evenmask); linop.Op(tmp,Mphie);
|
||||
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||
|
||||
// tmp = Mphie*evenmask + Mphio*oddmask;
|
||||
{
|
||||
auto tmp_ = tmp.View();
|
||||
auto evenmask_ = evenmask.View();
|
||||
auto oddmask_ = oddmask.View();
|
||||
auto Mphie_ = Mphie.View();
|
||||
auto Mphio_ = Mphio.View();
|
||||
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||
});
|
||||
}
|
||||
|
||||
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||
|
||||
auto SelfProj_ = SelfProj.View();
|
||||
auto A_self = A[self_stencil].View();
|
||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||
for(int j=0;j<nbasis;j++){
|
||||
coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
if(hermitian) {
|
||||
std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
|
||||
ForceHermitian();
|
||||
}
|
||||
// AssertHermitian();
|
||||
// ForceDiagonal();
|
||||
}
|
||||
|
||||
#if 0
|
||||
///////////////////////////
|
||||
@ -619,25 +940,26 @@ public:
|
||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
||||
#endif
|
||||
/*
|
||||
if(hermitian) {
|
||||
std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
|
||||
ForceHermitian();
|
||||
}
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
std::cout << GridLogMessage<< " dir "<< norm2(A[p]) <<std::endl;
|
||||
}
|
||||
*/
|
||||
// AssertHermitian();
|
||||
// ForceDiagonal();
|
||||
}
|
||||
|
||||
|
||||
void ForceHermitian(void) {
|
||||
for(int d=0;d<4;d++){
|
||||
int dd=d+1;
|
||||
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
|
||||
CoarseMatrix Diff (Grid());
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
int dir = geom.directions[p];
|
||||
int disp = geom.displacements[p];
|
||||
if(disp==-1) {
|
||||
// Find the opposite link
|
||||
for(int pp=0;pp<geom.npoint;pp++){
|
||||
int dirp = geom.directions[pp];
|
||||
int dispp = geom.displacements[pp];
|
||||
if ( (dirp==dir) && (dispp==1) ){
|
||||
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
|
||||
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
|
||||
A[pp] = adj(Cshift(A[p],dir,1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// A[8] = 0.5*(A[8] + adj(A[8]));
|
||||
}
|
||||
void AssertHermitian(void) {
|
||||
CoarseMatrix AA (Grid());
|
||||
|
Loading…
x
Reference in New Issue
Block a user